# Transform the Extracted Taxi Data

## Organize the data into df using the appropriate data types

In [22]:
import json
import zipfile
import os
import polars as pl

json_file_name = '/Users/mike/Data/Public/Chi_Taxi_Trips'

# Check if the JSON file exists
if os.path.exists(f"{json_file_name}.json"):
    with open(f"{json_file_name}.json", 'r') as f:
        data = json.load(f)
# Check if the ZIP file exists
elif os.path.exists(f"{json_file_name}.zip"):
    with zipfile.ZipFile(f"{json_file_name}.zip", 'r') as zip_ref:
        with zip_ref.open(zip_ref.namelist()[0]) as f:
            data = json.load(f)
else:
    raise FileNotFoundError(f"No such file or directory: '{json_file_name}.json' or '{json_file_name}.zip'")

# Remove unhashable columns
for entry in data:
    entry.pop('pickup_centroid_location', None)
    entry.pop('dropoff_centroid_location', None)

# Convert JSON data to a Polars DataFrame
df = pl.DataFrame(data)

# Drop duplicate rows
df = df.unique()

# Convert columns to appropriate types
df = df.with_columns([
    pl.col('trip_id').cast(pl.Utf8),
    pl.col('taxi_id').cast(pl.Utf8),
    pl.col('trip_start_timestamp').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f", strict=False),
    pl.col('trip_end_timestamp').str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%.f", strict=False),
    pl.col('trip_seconds').cast(pl.Float64),
    pl.col('trip_miles').cast(pl.Float64),
    pl.col('pickup_community_area').cast(pl.Int64),
    pl.col('dropoff_community_area').cast(pl.Int64),
    pl.col('fare').cast(pl.Float64),
    pl.col('tips').cast(pl.Float64),
    pl.col('tolls').cast(pl.Float64),
    pl.col('extras').cast(pl.Float64),
    pl.col('trip_total').cast(pl.Float64),
    pl.col('payment_type').cast(pl.Utf8),
    pl.col('company').cast(pl.Utf8),
    pl.col('pickup_centroid_latitude').cast(pl.Float64),
    pl.col('pickup_centroid_longitude').cast(pl.Float64),
    pl.col('dropoff_centroid_latitude').cast(pl.Float64),
    pl.col('dropoff_centroid_longitude').cast(pl.Float64),
    pl.col('pickup_census_tract').cast(pl.Utf8),
    pl.col('dropoff_census_tract').cast(pl.Utf8)])

# Display the DataFrame
df.head()


trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
str,str,datetime[μs],datetime[μs],f64,f64,i64,i64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,8,24,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,,
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,32,8,8.0,1.0,0.0,2.0,11.5,"""Credit Card""","""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,28,32,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100"""
"""a20cd18e92f8f33dab45f78ebde5f3…","""6b29392d56c7d58fe76ec1513ab5c0…",2024-12-29 09:45:00,2024-12-29 09:45:00,348.0,1.16,35,35,6.5,0.0,0.0,0.0,6.5,"""Prcard""","""Sun Taxi""",41.835118,-87.618678,41.835118,-87.618678,,
"""77acd09b0a28d9d3b371d46ae8e090…","""49d226e40bf779a8ba34d280971ff8…",2024-12-28 12:15:00,2024-12-28 12:30:00,1410.0,12.06,8,44,32.0,0.0,0.0,0.0,32.0,"""Cash""","""Flash Cab""",41.899602,-87.633308,41.740206,-87.61597,,


In [44]:
df.dtypes
for col, dtype in zip(df.columns, df.dtypes):
    print(f"{col}: {dtype}")

trip_id: String
taxi_id: String
trip_start_timestamp: Datetime(time_unit='us', time_zone=None)
trip_end_timestamp: Datetime(time_unit='us', time_zone=None)
trip_seconds: Float64
trip_miles: Float64
pickup_community_area: Float64
dropoff_community_area: Float64
fare: Float64
tips: Float64
tolls: Float64
extras: Float64
trip_total: Float64
payment_type: String
company: String
pickup_centroid_latitude: Float64
pickup_centroid_longitude: Float64
dropoff_centroid_latitude: Float64
dropoff_centroid_longitude: Float64
pickup_census_tract: String
dropoff_census_tract: String


In [45]:
# Summary statistics for the Polars DataFrame
df.describe()

statistic,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str
"""count""","""179679""","""179679""","""179679""","""179679""",179679.0,179679.0,179679.0,179679.0,179679.0,179679.0,179679.0,179679.0,179679.0,"""179679""","""179679""",179679.0,179679.0,179679.0,179679.0,"""179679""","""179679"""
"""null_count""","""0""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0""","""0""",0.0,0.0,0.0,0.0,"""0""","""0"""
"""mean""",,,"""2024-12-23 18:41:14.016997""","""2024-12-23 19:01:34.753978""",1195.683643,6.836591,33.409356,25.045372,21.122806,2.274768,0.016495,1.654709,25.234824,,,41.897539,-87.689464,41.890543,-87.654437,,
"""std""",,,,,1638.338323,8.609549,25.228502,19.334304,17.641721,3.758208,0.29783,4.886151,21.871028,,,0.064939,0.105301,0.057374,0.058529,,
"""min""","""0000b7daa281db74f51c8ee838283a…","""0044e6c0d091476299b99345501f75…","""2024-12-17 07:45:00""","""2024-12-17 07:45:00""",0.0,0.01,1.0,1.0,0.0,0.0,0.0,0.0,0.0,"""Cash""","""2733 - 74600 Benny Jona""",41.660136,-87.913625,41.660136,-87.913625,"""17031010202""","""17031010202"""
"""25%""",,,"""2024-12-19 16:30:00""","""2024-12-19 17:00:00""",499.0,1.3,8.0,8.0,8.25,0.0,0.0,0.0,10.16,,,41.878866,-87.688013,41.878866,-87.656412,,
"""50%""",,,"""2024-12-22 21:30:00""","""2024-12-22 22:00:00""",902.0,3.96,32.0,25.396486,15.0,0.0,0.0,0.0,17.4,,,41.895033,-87.637844,41.890922,-87.634156,,
"""75%""",,,"""2024-12-27 18:30:00""","""2024-12-27 19:00:00""",1560.0,11.51,50.0,32.0,31.0,3.0,0.0,1.0,33.25,,,41.944227,-87.625192,41.90752,-87.625192,,
"""max""","""fffe353e28c1f049ede16af7d5dd41…","""ffda53354c610fd3af1aee46d72302…","""2025-01-01 00:00:00""","""2025-01-03 17:45:00""",84014.0,2166.39,77.0,77.0,1234.0,100.0,41.75,270.0,1234.0,"""Unknown""","""Wolley Taxi""",42.016046,-87.534903,42.016046,-87.534903,"""Unknown""","""Unknown"""


### Set any Numercial Missing Data with Mean Values

In [46]:
# Fill missing values in numeric columns with the mean of each column
numeric_columns = df.select([pl.col(pl.Float64), pl.col(pl.Int64)]).columns
df = df.with_columns([pl.col(col).fill_null(pl.col(col).mean()) for col in numeric_columns])

# Display the DataFrame to verify the changes
df.head()

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,8.0,24.0,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,32.0,8.0,8.0,1.0,0.0,2.0,11.5,"""Credit Card""","""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,28.0,32.0,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100"""
"""a20cd18e92f8f33dab45f78ebde5f3…","""6b29392d56c7d58fe76ec1513ab5c0…",2024-12-29 09:45:00,2024-12-29 09:45:00,348.0,1.16,35.0,35.0,6.5,0.0,0.0,0.0,6.5,"""Prcard""","""Sun Taxi""",41.835118,-87.618678,41.835118,-87.618678,"""Unknown""","""Unknown"""
"""77acd09b0a28d9d3b371d46ae8e090…","""49d226e40bf779a8ba34d280971ff8…",2024-12-28 12:15:00,2024-12-28 12:30:00,1410.0,12.06,8.0,44.0,32.0,0.0,0.0,0.0,32.0,"""Cash""","""Flash Cab""",41.899602,-87.633308,41.740206,-87.61597,"""Unknown""","""Unknown"""


In [47]:
# Fill missing values in numeric columns with the mean of each column
numeric_columns = df.select([pl.col(pl.Float64), pl.col(pl.Int64)]).columns
df = df.with_columns([pl.col(col).fill_null(pl.col(col).mean()) for col in numeric_columns])

# Display the DataFrame to verify the changes
df.head()

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,8.0,24.0,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,32.0,8.0,8.0,1.0,0.0,2.0,11.5,"""Credit Card""","""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,28.0,32.0,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100"""
"""a20cd18e92f8f33dab45f78ebde5f3…","""6b29392d56c7d58fe76ec1513ab5c0…",2024-12-29 09:45:00,2024-12-29 09:45:00,348.0,1.16,35.0,35.0,6.5,0.0,0.0,0.0,6.5,"""Prcard""","""Sun Taxi""",41.835118,-87.618678,41.835118,-87.618678,"""Unknown""","""Unknown"""
"""77acd09b0a28d9d3b371d46ae8e090…","""49d226e40bf779a8ba34d280971ff8…",2024-12-28 12:15:00,2024-12-28 12:30:00,1410.0,12.06,8.0,44.0,32.0,0.0,0.0,0.0,32.0,"""Cash""","""Flash Cab""",41.899602,-87.633308,41.740206,-87.61597,"""Unknown""","""Unknown"""


In [48]:
# Calculate missing values statistics
missing_values = df.select([pl.col(col).is_null().sum().alias(col) for col in df.columns])

# Display missing values statistics
display(missing_values)

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [49]:
# Fill missing values in string columns with a placeholder
string_columns = df.select(pl.col(pl.Utf8)).columns
df = df.with_columns([pl.col(col).fill_null('Unknown') for col in string_columns])

# Display the DataFrame to verify the changes
df.head()

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,8.0,24.0,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,32.0,8.0,8.0,1.0,0.0,2.0,11.5,"""Credit Card""","""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,28.0,32.0,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100"""
"""a20cd18e92f8f33dab45f78ebde5f3…","""6b29392d56c7d58fe76ec1513ab5c0…",2024-12-29 09:45:00,2024-12-29 09:45:00,348.0,1.16,35.0,35.0,6.5,0.0,0.0,0.0,6.5,"""Prcard""","""Sun Taxi""",41.835118,-87.618678,41.835118,-87.618678,"""Unknown""","""Unknown"""
"""77acd09b0a28d9d3b371d46ae8e090…","""49d226e40bf779a8ba34d280971ff8…",2024-12-28 12:15:00,2024-12-28 12:30:00,1410.0,12.06,8.0,44.0,32.0,0.0,0.0,0.0,32.0,"""Cash""","""Flash Cab""",41.899602,-87.633308,41.740206,-87.61597,"""Unknown""","""Unknown"""


In [50]:
# Fill missing values in string columns with a placeholder
string_columns = df.select(pl.col(pl.Utf8)).columns
df = df.with_columns([pl.col(col).fill_null('Unknown') for col in string_columns])

# Display the DataFrame to verify the changes
df.head()

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,8.0,24.0,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,32.0,8.0,8.0,1.0,0.0,2.0,11.5,"""Credit Card""","""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,28.0,32.0,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100"""
"""a20cd18e92f8f33dab45f78ebde5f3…","""6b29392d56c7d58fe76ec1513ab5c0…",2024-12-29 09:45:00,2024-12-29 09:45:00,348.0,1.16,35.0,35.0,6.5,0.0,0.0,0.0,6.5,"""Prcard""","""Sun Taxi""",41.835118,-87.618678,41.835118,-87.618678,"""Unknown""","""Unknown"""
"""77acd09b0a28d9d3b371d46ae8e090…","""49d226e40bf779a8ba34d280971ff8…",2024-12-28 12:15:00,2024-12-28 12:30:00,1410.0,12.06,8.0,44.0,32.0,0.0,0.0,0.0,32.0,"""Cash""","""Flash Cab""",41.899602,-87.633308,41.740206,-87.61597,"""Unknown""","""Unknown"""


In [51]:
zero_trip_miles_count = df.filter(pl.col('trip_miles') == 0).shape[0]
print(f"Number of rows with trip_miles=0: {zero_trip_miles_count}")

Number of rows with trip_miles=0: 0


In [52]:
original_row_count = df.shape[0]

df = df.filter(pl.col('trip_miles') > 0).with_row_index(name='index').drop('index')

# Calculate the number of rows dropped
rows_dropped = original_row_count - df.shape[0]
print(f"Number of rows dropped: {rows_dropped}")

# Display the DataFrame to verify the changes
df.head()

Number of rows dropped: 0


trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,8.0,24.0,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,32.0,8.0,8.0,1.0,0.0,2.0,11.5,"""Credit Card""","""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,28.0,32.0,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100"""
"""a20cd18e92f8f33dab45f78ebde5f3…","""6b29392d56c7d58fe76ec1513ab5c0…",2024-12-29 09:45:00,2024-12-29 09:45:00,348.0,1.16,35.0,35.0,6.5,0.0,0.0,0.0,6.5,"""Prcard""","""Sun Taxi""",41.835118,-87.618678,41.835118,-87.618678,"""Unknown""","""Unknown"""
"""77acd09b0a28d9d3b371d46ae8e090…","""49d226e40bf779a8ba34d280971ff8…",2024-12-28 12:15:00,2024-12-28 12:30:00,1410.0,12.06,8.0,44.0,32.0,0.0,0.0,0.0,32.0,"""Cash""","""Flash Cab""",41.899602,-87.633308,41.740206,-87.61597,"""Unknown""","""Unknown"""


In [53]:
zero_trip_miles_count = df.filter(pl.col('trip_miles') == 0).shape[0]
print(f"Number of rows with trip_miles=0: {zero_trip_miles_count}")

Number of rows with trip_miles=0: 0


In [54]:
original_row_count = df.shape[0]

df = df.filter(pl.col('trip_miles') > 0).with_row_index(name='index').drop('index')

# Calculate the number of rows dropped
rows_dropped = original_row_count - df.shape[0]
print(f"Number of rows dropped: {rows_dropped}")

# Display the DataFrame to verify the changes
df.head()

Number of rows dropped: 0


trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,8.0,24.0,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,32.0,8.0,8.0,1.0,0.0,2.0,11.5,"""Credit Card""","""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,28.0,32.0,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100"""
"""a20cd18e92f8f33dab45f78ebde5f3…","""6b29392d56c7d58fe76ec1513ab5c0…",2024-12-29 09:45:00,2024-12-29 09:45:00,348.0,1.16,35.0,35.0,6.5,0.0,0.0,0.0,6.5,"""Prcard""","""Sun Taxi""",41.835118,-87.618678,41.835118,-87.618678,"""Unknown""","""Unknown"""
"""77acd09b0a28d9d3b371d46ae8e090…","""49d226e40bf779a8ba34d280971ff8…",2024-12-28 12:15:00,2024-12-28 12:30:00,1410.0,12.06,8.0,44.0,32.0,0.0,0.0,0.0,32.0,"""Cash""","""Flash Cab""",41.899602,-87.633308,41.740206,-87.61597,"""Unknown""","""Unknown"""


In [55]:
zero_trip_miles_count = df.filter(pl.col('trip_miles') == 0).shape[0]
print(f"Number of rows with trip_miles=0: {zero_trip_miles_count}")

Number of rows with trip_miles=0: 0


In [56]:
zero_trip_miles_count = df.filter(pl.col('trip_miles') == 0).shape[0]
print(f"Number of rows with trip_miles=0: {zero_trip_miles_count}")

Number of rows with trip_miles=0: 0


In [60]:
# Calculate the number of null values in the community area columns
null_pickup_community_area = df.filter(pl.col('pickup_community_area').is_null()).shape[0]
null_dropoff_community_area = df.filter(pl.col('dropoff_community_area').is_null()).shape[0]

print(f"Number of null values in pickup_community_area: {null_pickup_community_area}")
print(f"Number of null values in dropoff_community_area: {null_dropoff_community_area}")

Number of null values in pickup_community_area: 0
Number of null values in dropoff_community_area: 0


### Create a Dim for Time

In [61]:
import polars as pl

# Extract start and end timestamps
start_times = df['trip_start_timestamp']
end_times = df['trip_end_timestamp']

# Combine start and end timestamps into a single series
all_times = pl.concat([start_times, end_times]).unique().sort()

# Create the time dimension dataframe
dim_time = pl.DataFrame({
    'fk_trip_timestamp': all_times,
    'date': all_times.dt.date(),
    'time': all_times.dt.time(),
    'month': all_times.dt.month(),
    'hour': all_times.dt.hour(),
    'minute': all_times.dt.minute()
})

# Set fk_trip_timestamp as the index
dim_time = dim_time.set_sorted('fk_trip_timestamp')

# Display the time dimension dataframe
dim_time.head()

fk_trip_timestamp,date,time,month,hour,minute
datetime[μs],date,time,i8,i8,i8
2024-12-17 07:45:00,2024-12-17,07:45:00,12,7,45
2024-12-17 08:00:00,2024-12-17,08:00:00,12,8,0
2024-12-17 08:15:00,2024-12-17,08:15:00,12,8,15
2024-12-17 08:30:00,2024-12-17,08:30:00,12,8,30
2024-12-17 08:45:00,2024-12-17,08:45:00,12,8,45


In [62]:
# Add a column for the day of the week as an abbreviated string
dim_time = dim_time.with_columns([
    dim_time['fk_trip_timestamp'].dt.strftime('%a').alias('day_of_week')
])

# Display the updated time dimension dataframe
dim_time.head()

fk_trip_timestamp,date,time,month,hour,minute,day_of_week
datetime[μs],date,time,i8,i8,i8,str
2024-12-17 07:45:00,2024-12-17,07:45:00,12,7,45,"""Tue"""
2024-12-17 08:00:00,2024-12-17,08:00:00,12,8,0,"""Tue"""
2024-12-17 08:15:00,2024-12-17,08:15:00,12,8,15,"""Tue"""
2024-12-17 08:30:00,2024-12-17,08:30:00,12,8,30,"""Tue"""
2024-12-17 08:45:00,2024-12-17,08:45:00,12,8,45,"""Tue"""


### Create a Dimension for location data

In [85]:
# Extract unique combinations of latitude, longitude, and census tract from pickup locations
pickup_locations = df.select(['pickup_centroid_latitude', 'pickup_centroid_longitude', 'pickup_census_tract']).unique()

# Create the location dimension dataframe
dim_location = pickup_locations.with_columns([
    pl.col('pickup_centroid_latitude').alias('fk_latitude'),
    pl.col('pickup_centroid_longitude').alias('fk_longitude'),
    pl.col('pickup_census_tract').alias('census_tract'),
    pl.lit(None).cast(pl.Int64).alias('community_area')
])

# Drop the original latitude, longitude, and census tract columns
dim_location = dim_location.drop(['pickup_centroid_latitude', 'pickup_centroid_longitude', 'pickup_census_tract'])

# Display the location dimension dataframe
dim_location.head()


fk_latitude,fk_longitude,census_tract,community_area
f64,f64,str,i64
41.968069,-87.721559,"""Unknown""",
41.884987,-87.620993,"""17031320100""",
41.892073,-87.628874,"""17031081600""",
41.938232,-87.646782,"""17031063100""",
41.942692,-87.651771,"""17031062100""",


In [86]:
null_community_areas_count = dim_location.filter(pl.col('community_area').is_null()).shape[0]
print(f"Number of community areas in dim_location that are null: {null_community_areas_count}")

Number of community areas in dim_location that are null: 226


In [87]:
# Fill null values in dim_location with 0
dim_location = dim_location.fill_null(0)

# Display the updated dim_location dataframe
dim_location.head()

fk_latitude,fk_longitude,census_tract,community_area
f64,f64,str,i64
41.968069,-87.721559,"""Unknown""",0
41.884987,-87.620993,"""17031320100""",0
41.892073,-87.628874,"""17031081600""",0
41.938232,-87.646782,"""17031063100""",0
41.942692,-87.651771,"""17031062100""",0


### Create a Dim for Payment Types

In [88]:
# Extract unique payment types
unique_payment_types = df.select(pl.col('payment_type')).unique()

# Create the payment type dimension dataframe
dim_payment_type = unique_payment_types.with_columns([
    pl.arange(1, unique_payment_types.height + 1).alias('payment_type_key')
])

# Display the payment type dimension dataframe
dim_payment_type.head()


payment_type,payment_type_key
str,i64
"""Unknown""",1
"""Prcard""",2
"""No Charge""",3
"""Cash""",4
"""Credit Card""",5


### Create a Dim for Taxi Companies based on the Taxi Id

In [89]:
# Create the taxi dimension dataframe
dim_taxi = df.select(['taxi_id', 'company']).unique().with_columns([
    pl.col('taxi_id').alias('fk_taxi_id')
]).drop('taxi_id')

# Rearrange columns to have fk_taxi_id first
dim_taxi = dim_taxi.select(['fk_taxi_id', 'company'])

# Display the taxi dimension dataframe
dim_taxi.head(10)

fk_taxi_id,company
str,str
"""b52493d43f7de565ab5eaaa0b12387…","""Tac - Yellow Cab Association"""
"""9d0ae86bab93b06fc8d952808f4fcc…","""Taxicab Insurance Agency Llc"""
"""19b4ae2f19dd457dced6a79a9a5fea…","""Chicago Independents"""
"""8b1a88e5a09cfd55ca72d267f00f56…","""5 Star Taxi"""
"""365689b9f3107b807470fe16b781f7…","""Blue Ribbon Taxi Association"""
"""a31d2ea87ea4f5a4793c30f84f000b…","""Flash Cab"""
"""cea7a9702e82637b2250c3f993e1ea…","""Taxicab Insurance Agency Llc"""
"""00f4b381570486f8575cbaa57ed41f…","""Taxi Affiliation Services"""
"""0d155321bfc93b437e0630bd155d06…","""Flash Cab"""
"""f454eed0504cea35dec37d008e9408…","""5 Star Taxi"""


### Create the Fact Table merging with Dim tables

#### The number of records is around 200K. This means the merge operations must be done in chunks, along with other mem optimizations

##### What happends if we increase the number of rows to 1M or 2M? (We will see later....)

In [90]:
# Join df with dim_time on trip_start_timestamp and trip_end_timestamp
fact_table = df.join(dim_time, left_on='trip_start_timestamp', right_on='fk_trip_timestamp', how='left') \
               .join(dim_time, left_on='trip_end_timestamp', right_on='fk_trip_timestamp', how='left', suffix='_end')

# Join with dim_location on pickup and dropoff coordinates
fact_table = fact_table.join(dim_location, left_on=['pickup_centroid_latitude', 'pickup_centroid_longitude'], right_on=['fk_latitude', 'fk_longitude'], how='left', suffix='_pickup') \
.join(dim_location, left_on=['dropoff_centroid_latitude', 'dropoff_centroid_longitude'], right_on=['fk_latitude', 'fk_longitude'], how='left', suffix='_dropoff')

# Join with dim_taxi on taxi_id
fact_table = fact_table.join(dim_taxi, left_on='taxi_id', right_on='fk_taxi_id', how='left')

# Join with dim_payment_type on payment_type
fact_table = fact_table.join(dim_payment_type, left_on='payment_type', right_on='payment_type', how='left')
fact_table.head()

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract,date,time,month,hour,minute,day_of_week,date_end,time_end,month_end,hour_end,minute_end,day_of_week_end,census_tract,community_area,census_tract_dropoff,community_area_dropoff,company_right,payment_type_key
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,date,time,i8,i8,i8,str,date,time,i8,i8,i8,str,str,i64,str,i64,str,i64
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,8.0,24.0,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""",2024-12-20,19:30:00,12,19,30,"""Fri""",2024-12-20,19:45:00,12,19,45,"""Fri""","""Unknown""",0,"""Unknown""",0,"""Tac - American United Dispatch""",6
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,8.0,24.0,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""",2024-12-20,19:30:00,12,19,30,"""Fri""",2024-12-20,19:45:00,12,19,45,"""Fri""","""Unknown""",0,"""Unknown""",0,"""Taxi Affiliation Services""",6
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,32.0,8.0,8.0,1.0,0.0,2.0,11.5,"""Credit Card""","""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201""",2024-12-30,16:15:00,12,16,15,"""Mon""",2024-12-30,16:30:00,12,16,30,"""Mon""","""17031320100""",0,"""17031081201""",0,"""Taxicab Insurance Agency Llc""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,28.0,32.0,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""",2024-12-20,09:00:00,12,9,0,"""Fri""",2024-12-20,09:00:00,12,9,0,"""Fri""","""17031281900""",0,"""17031839100""",0,"""Tac - Yellow Cab Association""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,28.0,32.0,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""",2024-12-20,09:00:00,12,9,0,"""Fri""",2024-12-20,09:00:00,12,9,0,"""Fri""","""17031281900""",0,"""17031839100""",0,"""Taxi Affiliation Services""",5


#### Drop unneeded cols from the fact table  

In [91]:
fact_table = fact_table.drop(['pickup_community_area', 'dropoff_community_area'])

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract,date,time,month,hour,minute,day_of_week,date_end,time_end,month_end,hour_end,minute_end,day_of_week_end,census_tract,community_area,census_tract_dropoff,community_area_dropoff,company_right,payment_type_key
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,date,time,i8,i8,i8,str,date,time,i8,i8,i8,str,str,i64,str,i64,str,i64
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""",2024-12-20,19:30:00,12,19,30,"""Fri""",2024-12-20,19:45:00,12,19,45,"""Fri""","""Unknown""",0,"""Unknown""",0,"""Tac - American United Dispatch""",6
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,10.47,1.29,0.0,0.0,11.76,"""Mobile""","""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""",2024-12-20,19:30:00,12,19,30,"""Fri""",2024-12-20,19:45:00,12,19,45,"""Fri""","""Unknown""",0,"""Unknown""",0,"""Taxi Affiliation Services""",6
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,8.0,1.0,0.0,2.0,11.5,"""Credit Card""","""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201""",2024-12-30,16:15:00,12,16,15,"""Mon""",2024-12-30,16:30:00,12,16,30,"""Mon""","""17031320100""",0,"""17031081201""",0,"""Taxicab Insurance Agency Llc""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""",2024-12-20,09:00:00,12,9,0,"""Fri""",2024-12-20,09:00:00,12,9,0,"""Fri""","""17031281900""",0,"""17031839100""",0,"""Tac - Yellow Cab Association""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,4.75,0.0,0.0,0.0,4.75,"""Credit Card""","""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""",2024-12-20,09:00:00,12,9,0,"""Fri""",2024-12-20,09:00:00,12,9,0,"""Fri""","""17031281900""",0,"""17031839100""",0,"""Taxi Affiliation Services""",5


In [92]:
fact_table = fact_table.drop('payment_type')

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract,date,time,month,hour,minute,day_of_week,date_end,time_end,month_end,hour_end,minute_end,day_of_week_end,census_tract,community_area,census_tract_dropoff,community_area_dropoff,company_right,payment_type_key
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,str,str,date,time,i8,i8,i8,str,date,time,i8,i8,i8,str,str,i64,str,i64,str,i64
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,10.47,1.29,0.0,0.0,11.76,"""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""",2024-12-20,19:30:00,12,19,30,"""Fri""",2024-12-20,19:45:00,12,19,45,"""Fri""","""Unknown""",0,"""Unknown""",0,"""Tac - American United Dispatch""",6
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,10.47,1.29,0.0,0.0,11.76,"""Tac - American United Dispatch""",41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""",2024-12-20,19:30:00,12,19,30,"""Fri""",2024-12-20,19:45:00,12,19,45,"""Fri""","""Unknown""",0,"""Unknown""",0,"""Taxi Affiliation Services""",6
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,8.0,1.0,0.0,2.0,11.5,"""Taxicab Insurance Agency Llc""",41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201""",2024-12-30,16:15:00,12,16,15,"""Mon""",2024-12-30,16:30:00,12,16,30,"""Mon""","""17031320100""",0,"""17031081201""",0,"""Taxicab Insurance Agency Llc""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,4.75,0.0,0.0,0.0,4.75,"""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""",2024-12-20,09:00:00,12,9,0,"""Fri""",2024-12-20,09:00:00,12,9,0,"""Fri""","""17031281900""",0,"""17031839100""",0,"""Tac - Yellow Cab Association""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,4.75,0.0,0.0,0.0,4.75,"""Taxi Affiliation Services""",41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""",2024-12-20,09:00:00,12,9,0,"""Fri""",2024-12-20,09:00:00,12,9,0,"""Fri""","""17031281900""",0,"""17031839100""",0,"""Taxi Affiliation Services""",5


In [93]:
fact_table = fact_table.drop('company')

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract,date,time,month,hour,minute,day_of_week,date_end,time_end,month_end,hour_end,minute_end,day_of_week_end,census_tract,community_area,census_tract_dropoff,community_area_dropoff,company_right,payment_type_key
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,date,time,i8,i8,i8,str,date,time,i8,i8,i8,str,str,i64,str,i64,str,i64
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,10.47,1.29,0.0,0.0,11.76,41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""",2024-12-20,19:30:00,12,19,30,"""Fri""",2024-12-20,19:45:00,12,19,45,"""Fri""","""Unknown""",0,"""Unknown""",0,"""Tac - American United Dispatch""",6
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,10.47,1.29,0.0,0.0,11.76,41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""",2024-12-20,19:30:00,12,19,30,"""Fri""",2024-12-20,19:45:00,12,19,45,"""Fri""","""Unknown""",0,"""Unknown""",0,"""Taxi Affiliation Services""",6
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,8.0,1.0,0.0,2.0,11.5,41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201""",2024-12-30,16:15:00,12,16,15,"""Mon""",2024-12-30,16:30:00,12,16,30,"""Mon""","""17031320100""",0,"""17031081201""",0,"""Taxicab Insurance Agency Llc""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,4.75,0.0,0.0,0.0,4.75,41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""",2024-12-20,09:00:00,12,9,0,"""Fri""",2024-12-20,09:00:00,12,9,0,"""Fri""","""17031281900""",0,"""17031839100""",0,"""Tac - Yellow Cab Association""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,4.75,0.0,0.0,0.0,4.75,41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""",2024-12-20,09:00:00,12,9,0,"""Fri""",2024-12-20,09:00:00,12,9,0,"""Fri""","""17031281900""",0,"""17031839100""",0,"""Taxi Affiliation Services""",5


In [94]:
columns_to_drop = ['date', 'time', 'month', 'hour', 'minute', 'day_of_week', 'date_end', 'time_end', 'month_end', 'hour_end', 'minute_end', 'day_of_week_end']
fact_table = fact_table.drop(columns_to_drop)

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract,census_tract,community_area,census_tract_dropoff,community_area_dropoff,company_right,payment_type_key
str,str,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,i64,str,i64,str,i64
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,10.47,1.29,0.0,0.0,11.76,41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""","""Unknown""",0,"""Unknown""",0,"""Tac - American United Dispatch""",6
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",2024-12-20 19:30:00,2024-12-20 19:45:00,980.0,2.35,10.47,1.29,0.0,0.0,11.76,41.899602,-87.633308,41.901207,-87.676356,"""Unknown""","""Unknown""","""Unknown""",0,"""Unknown""",0,"""Taxi Affiliation Services""",6
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",2024-12-30 16:15:00,2024-12-30 16:30:00,742.0,1.04,8.0,1.0,0.0,2.0,11.5,41.884987,-87.620993,41.899156,-87.626211,"""17031320100""","""17031081201""","""17031320100""",0,"""17031081201""",0,"""Taxicab Insurance Agency Llc""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,4.75,0.0,0.0,0.0,4.75,41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""","""17031281900""",0,"""17031839100""",0,"""Tac - Yellow Cab Association""",5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",2024-12-20 09:00:00,2024-12-20 09:00:00,180.0,0.5,4.75,0.0,0.0,0.0,4.75,41.879255,-87.642649,41.880994,-87.632746,"""17031281900""","""17031839100""","""17031281900""",0,"""17031839100""",0,"""Taxi Affiliation Services""",5


In [103]:
columns_to_drop = ['community_area', 'community_area_dropoff', 'company_right', 'census_tract']
existing_columns_to_drop = [col for col in columns_to_drop if col in fact_table.columns]
fact_table = fact_table.drop(existing_columns_to_drop)

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,payment_type_key,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract,census_tract_dropoff
str,str,i64,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,980.0,2.35,10.47,1.29,0.0,0.0,11.76,41.901207,-87.676356,"""Unknown""","""Unknown""","""Unknown"""
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,980.0,2.35,10.47,1.29,0.0,0.0,11.76,41.901207,-87.676356,"""Unknown""","""Unknown""","""Unknown"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",5,2024-12-30 16:15:00,2024-12-30 16:30:00,41.884987,-87.620993,742.0,1.04,8.0,1.0,0.0,2.0,11.5,41.899156,-87.626211,"""17031320100""","""17031081201""","""17031081201"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,180.0,0.5,4.75,0.0,0.0,0.0,4.75,41.880994,-87.632746,"""17031281900""","""17031839100""","""17031839100"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,180.0,0.5,4.75,0.0,0.0,0.0,4.75,41.880994,-87.632746,"""17031281900""","""17031839100""","""17031839100"""


In [104]:
columns_to_drop = ['pickup_census_tract', 'dropoff_census_tract', 'census_tract_dropoff']
existing_columns_to_drop = [col for col in columns_to_drop if col in fact_table.columns]
fact_table = fact_table.drop(existing_columns_to_drop)

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,payment_type_key,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,dropoff_centroid_latitude,dropoff_centroid_longitude
str,str,i64,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,980.0,2.35,10.47,1.29,0.0,0.0,11.76,41.901207,-87.676356
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,980.0,2.35,10.47,1.29,0.0,0.0,11.76,41.901207,-87.676356
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",5,2024-12-30 16:15:00,2024-12-30 16:30:00,41.884987,-87.620993,742.0,1.04,8.0,1.0,0.0,2.0,11.5,41.899156,-87.626211
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,180.0,0.5,4.75,0.0,0.0,0.0,4.75,41.880994,-87.632746
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,180.0,0.5,4.75,0.0,0.0,0.0,4.75,41.880994,-87.632746


In [105]:
# Specify the columns to move to the end
columns_to_move = ['trip_seconds', 'trip_miles', 'fare', 'tips', 'tolls', 'extras', 'trip_total']

# Get the remaining columns
remaining_columns = [col for col in fact_table.columns if col not in columns_to_move]

# Rearrange the columns
fact_table = fact_table.select(remaining_columns + columns_to_move)

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,payment_type_key,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total
str,str,i64,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",5,2024-12-30 16:15:00,2024-12-30 16:30:00,41.884987,-87.620993,41.899156,-87.626211,742.0,1.04,8.0,1.0,0.0,2.0,11.5
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75


#### Validate num of rows in Fact

In [106]:
num_rows = fact_table.shape[0]
print(f"Number of rows in the fact table: {num_rows}")

Number of rows in the fact table: 204153


## Lets add some new metrics to the Fact Tables

In [110]:
# Calculate trip duration in minutes
fact_table = fact_table.with_columns([
    (pl.col('trip_seconds') / 60).alias('trip_duration_minutes')
])

# Ensure trip_seconds is never zero
fact_table = fact_table.with_columns([
    pl.when(pl.col('trip_seconds') == 0).then(1).otherwise(pl.col('trip_seconds')).alias('trip_seconds')
])

# Calculate average speed (miles per hour)
fact_table = fact_table.with_columns([
    (pl.col('trip_miles') / (pl.col('trip_seconds') / 3600)).alias('average_speed_mph')
])

# Handle missing values
fact_table = fact_table.with_columns([
    pl.col('fare').fill_null(pl.col('fare').mean()).alias('fare'),
    pl.col('tips').fill_null(0).alias('tips'),
    pl.col('tolls').fill_null(0).alias('tolls'),
    pl.col('extras').fill_null(0).alias('extras'),
    pl.col('trip_total').fill_null(pl.col('fare') + pl.col('tips') + pl.col('tolls') + pl.col('extras')).alias('trip_total')
])

# Display the transformed fact table
fact_table.head()

trip_id,taxi_id,payment_type_key,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,trip_duration_minutes,average_speed_mph
str,str,i64,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76,16.333333,8.632653
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76,16.333333,8.632653
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",5,2024-12-30 16:15:00,2024-12-30 16:30:00,41.884987,-87.620993,41.899156,-87.626211,742.0,1.04,8.0,1.0,0.0,2.0,11.5,12.366667,5.045822
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75,3.0,10.0
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75,3.0,10.0


In [117]:
# Define the bins and labels for trip length categories
bins = [0, 300, 900, 1800, float('inf')]  # in seconds
labels = ['short', 'medium', 'long', 'very long']

# Create a new column 'trip_length_category' based on the bins and labels
fact_table = fact_table.with_columns([
    pl.when(pl.col('trip_seconds') <= bins[1]).then(pl.lit(labels[0]))
      .when(pl.col('trip_seconds') <= bins[2]).then(pl.lit(labels[1]))
      .when(pl.col('trip_seconds') <= bins[3]).then(pl.lit(labels[2]))
      .otherwise(pl.lit(labels[3])).alias('trip_length_category')
])

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,payment_type_key,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,trip_duration_minutes,average_speed_mph,trip_length_category
str,str,i64,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76,16.333333,8.632653,"""long"""
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76,16.333333,8.632653,"""long"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",5,2024-12-30 16:15:00,2024-12-30 16:30:00,41.884987,-87.620993,41.899156,-87.626211,742.0,1.04,8.0,1.0,0.0,2.0,11.5,12.366667,5.045822,"""medium"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75,3.0,10.0,"""short"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75,3.0,10.0,"""short"""


In [118]:
# Define the bins and labels for average speed categories
speed_bins = [0, 10, 20, 30, float('inf')]  # in miles per hour
speed_labels = ['slow', 'moderate', 'fast', 'very fast']

# Create a new column 'speed_category' based on the bins and labels
fact_table = fact_table.with_columns([
    pl.when(pl.col('average_speed_mph') <= speed_bins[1]).then(pl.lit(speed_labels[0]))
      .when(pl.col('average_speed_mph') <= speed_bins[2]).then(pl.lit(speed_labels[1]))
      .when(pl.col('average_speed_mph') <= speed_bins[3]).then(pl.lit(speed_labels[2]))
      .otherwise(pl.lit(speed_labels[3])).alias('speed_category')
])

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,payment_type_key,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,trip_duration_minutes,average_speed_mph,trip_length_category,speed_category
str,str,i64,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76,16.333333,8.632653,"""long""","""slow"""
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76,16.333333,8.632653,"""long""","""slow"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",5,2024-12-30 16:15:00,2024-12-30 16:30:00,41.884987,-87.620993,41.899156,-87.626211,742.0,1.04,8.0,1.0,0.0,2.0,11.5,12.366667,5.045822,"""medium""","""slow"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75,3.0,10.0,"""short""","""slow"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75,3.0,10.0,"""short""","""slow"""


In [119]:
fact_table = fact_table.drop('trip_duration_minutes')

# Display the updated fact table to verify the changes
fact_table.head()

trip_id,taxi_id,payment_type_key,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,average_speed_mph,trip_length_category,speed_category
str,str,i64,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76,8.632653,"""long""","""slow"""
"""5c0b9d9d95e401aa64da33eb3ffebb…","""545ac2dfd5b722e0f0d884cc68ec27…",6,2024-12-20 19:30:00,2024-12-20 19:45:00,41.899602,-87.633308,41.901207,-87.676356,980.0,2.35,10.47,1.29,0.0,0.0,11.76,8.632653,"""long""","""slow"""
"""6952ad9bc387cdc714604ec7af4053…","""d4578429775c32e03fb5139e1d132b…",5,2024-12-30 16:15:00,2024-12-30 16:30:00,41.884987,-87.620993,41.899156,-87.626211,742.0,1.04,8.0,1.0,0.0,2.0,11.5,5.045822,"""medium""","""slow"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75,10.0,"""short""","""slow"""
"""12535c80e6bed94672cc8f43d2b9be…","""083b7260314e48be5e10a9191da36f…",5,2024-12-20 09:00:00,2024-12-20 09:00:00,41.879255,-87.642649,41.880994,-87.632746,180.0,0.5,4.75,0.0,0.0,0.0,4.75,10.0,"""short""","""slow"""


In [121]:
# Check for duplicates in the fact table
fact_table_unique = fact_table.unique()
duplicates_count = fact_table.shape[0] - fact_table_unique.shape[0]

print(f"Number of duplicate rows in the fact table: {duplicates_count}")

# Drop duplicates
fact_table = fact_table_unique

# Display the updated fact table to verify the changes
fact_table.head()

Number of duplicate rows in the fact table: 24474


trip_id,taxi_id,payment_type_key,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,average_speed_mph,trip_length_category,speed_category
str,str,i64,datetime[μs],datetime[μs],f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str
"""ece78c7239e0f2470736a936f5296f…","""c52371aa17a6dd092799762c903f7c…",6,2024-12-19 07:30:00,2024-12-19 07:45:00,41.899602,-87.633308,41.899602,-87.633308,372.0,1.19,10.0,1.0,0.0,0.0,11.0,11.516129,"""medium""","""moderate"""
"""86f86ba3322a7e18efea4f166bd5a3…","""f1466ba65ee3a4d113741eee02608c…",5,2024-12-21 20:15:00,2024-12-21 20:45:00,41.980264,-87.913625,41.983636,-87.723583,1136.0,9.7,25.75,9.08,0.0,4.0,39.33,30.739437,"""long""","""very fast"""
"""02f98d6018d2bc6d6e7583e6ca2414…","""76a1fbc06546e3f73c964706a0e0a8…",4,2024-12-23 08:00:00,2024-12-23 08:00:00,41.880994,-87.632746,41.880994,-87.632746,329.0,0.63,5.5,0.0,0.0,1.0,6.5,6.893617,"""medium""","""slow"""
"""e3881a9c8ca9c75559a7ed527f871c…","""a04ded5c1365641c170cd53f6befbc…",6,2024-12-19 22:00:00,2024-12-19 22:30:00,41.878866,-87.625192,41.986712,-87.663416,1045.0,8.76,27.64,0.0,0.0,0.0,27.64,30.17799,"""long""","""very fast"""
"""eb2a1e57b866b83bf6b7d01fd5cd79…","""e30325d7d21a95e76154ac70fd5f3c…",5,2024-12-26 17:45:00,2024-12-26 18:45:00,41.879255,-87.642649,41.871016,-87.631407,3387.0,5.36,23.25,3.0,0.0,4.0,30.75,5.697077,"""very long""","""slow"""


In [122]:
num_rows_fact_table = fact_table.shape[0]
print(f"Number of rows in the fact table: {num_rows_fact_table}")

Number of rows in the fact table: 179679
