# Transform the Extracted Taxi Data

## Organize the data into df using the appropriate data types

In [3]:
import json
import zipfile
import os
import pandas as pd

json_file_name = '/Users/mike/Data/Public/Chi_Taxi_Trips'

# Check if the JSON file exists
if os.path.exists(f"{json_file_name}.json"):
    with open(f"{json_file_name}.json", 'r') as f:
        data = json.load(f)
# Check if the ZIP file exists
elif os.path.exists(f"{json_file_name}.zip"):
    with zipfile.ZipFile(f"{json_file_name}.zip", 'r') as zip_ref:
        with zip_ref.open(zip_ref.namelist()[0]) as f:
            data = json.load(f)
else:
    raise FileNotFoundError(f"No such file or directory: '{json_file_name}.json' or '{json_file_name}.zip'")

# Remove unhashable columns
for entry in data:
    entry.pop('pickup_centroid_location', None)
    entry.pop('dropoff_centroid_location', None)

# Convert JSON data to a DataFrame
df = pd.DataFrame(data)

# Drop duplicate rows
df = df.drop_duplicates()

# Convert columns to appropriate types
df['trip_id'] = df['trip_id'].astype(str)
df['taxi_id'] = df['taxi_id'].astype(str)
df['trip_start_timestamp'] = pd.to_datetime(df['trip_start_timestamp'])
df['trip_end_timestamp'] = pd.to_datetime(df['trip_end_timestamp'])
df['trip_seconds'] = pd.to_numeric(df['trip_seconds'], errors='coerce')
df['trip_miles'] = pd.to_numeric(df['trip_miles'], errors='coerce')
df['pickup_community_area'] = pd.to_numeric(df['pickup_community_area'], errors='coerce').astype('Int64')
df['dropoff_community_area'] = pd.to_numeric(df['dropoff_community_area'], errors='coerce').astype('Int64')
df['fare'] = pd.to_numeric(df['fare'], errors='coerce')
df['tips'] = pd.to_numeric(df['tips'], errors='coerce')
df['tolls'] = pd.to_numeric(df['tolls'], errors='coerce')
df['extras'] = pd.to_numeric(df['extras'], errors='coerce')
df['trip_total'] = pd.to_numeric(df['trip_total'], errors='coerce')
df['payment_type'] = df['payment_type'].astype(str)
df['company'] = df['company'].astype(str)
df['pickup_centroid_latitude'] = pd.to_numeric(df['pickup_centroid_latitude'], errors='coerce')
df['pickup_centroid_longitude'] = pd.to_numeric(df['pickup_centroid_longitude'], errors='coerce')
df['dropoff_centroid_latitude'] = pd.to_numeric(df['dropoff_centroid_latitude'], errors='coerce')
df['dropoff_centroid_longitude'] = pd.to_numeric(df['dropoff_centroid_longitude'], errors='coerce')
df['pickup_census_tract'] = df['pickup_census_tract'].astype(str)
df['dropoff_census_tract'] = df['dropoff_census_tract'].astype(str)

# Display the DataFrame
display(df.head())


Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,...,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
0,f3a9abf8f20738d79ead51dc52a152b869405079,ce8e5531edf9255d9ecb2f930e320f9c1c83229e437445...,2025-01-01,2025-01-01 00:15:00,601.0,5.14,76.0,10,15.25,3.11,...,5.0,23.86,Credit Card,Blue Ribbon Taxi Association,41.980264,-87.913625,41.985015,-87.804532,,
1,310e0c9d20fb19370f0b4a80aea9c64ce64ca3cd,018e2fc4d3e35a1f76f3221f8ecf824591aea4e6e7e05f...,2025-01-01,2025-01-01 00:00:00,340.0,1.45,28.0,28,20.0,4.1,...,0.0,24.6,Mobile,Sun Taxi,41.874005,-87.663518,41.874005,-87.663518,,
2,53fc5da5892235cce7b470dc94440e491eae9532,9ac243b4bb4548e7214a248a491aa3ddddf391820e6c7b...,2025-01-01,2025-01-01 00:45:00,2886.0,18.28,76.0,32,51.5,5.6,...,4.0,61.6,Mobile,Medallion Leasin,41.980264,-87.913625,41.878866,-87.625192,,
3,65b4347a7a8525e779b1be2b3b93b39bb50ebeeb,2fea69c8a6e08471bc4339a05e9ee7955bef68d791f77a...,2025-01-01,2025-01-01 00:00:00,795.0,15.09,,34,37.0,0.0,...,0.0,37.0,Cash,Sun Taxi,,,41.842076,-87.633973,,
4,604c104c64de68af93a0f69ed52ba585f9f26427,d3b33673ee1a39325983e66d426003caf668f4509230cb...,2025-01-01,2025-01-01 00:30:00,1920.0,8.0,41.0,43,25.5,0.0,...,0.0,25.5,Unknown,Taxi Affiliation Services,41.79409,-87.592311,41.761578,-87.572782,,


### Check the schema

In [4]:
display(df.dtypes)

trip_id                               object
taxi_id                               object
trip_start_timestamp          datetime64[ns]
trip_end_timestamp            datetime64[ns]
trip_seconds                         float64
trip_miles                           float64
pickup_community_area                  Int64
dropoff_community_area                 Int64
fare                                 float64
tips                                 float64
tolls                                float64
extras                               float64
trip_total                           float64
payment_type                          object
company                               object
pickup_centroid_latitude             float64
pickup_centroid_longitude            float64
dropoff_centroid_latitude            float64
dropoff_centroid_longitude           float64
pickup_census_tract                   object
dropoff_census_tract                  object
dtype: object

### Get some Stats about the Data

In [5]:
# Summary statistics for numeric columns
print("Summary Statistics for Numeric Columns:")
display(df.describe())

# Missing values
print("\nMissing Values:")
missing_values = df.isnull().sum()
display(missing_values[missing_values > 0])

# Unique values
print("\nUnique Values:")
unique_values = df.nunique()
display(unique_values)

# Data types
print("\nData Types:")
display(df.dtypes)

# Correlation matrix for numeric columns
print("\nCorrelation Matrix:")
numeric_columns = df.select_dtypes(include=['number']).columns
correlation_matrix = df[numeric_columns].corr()
display(correlation_matrix)

# Value counts for categorical columns
print("\nValue Counts for Categorical Columns:")
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
for col in categorical_columns:
    print(f"\nValue Counts for {col}:")
    display(df[col].value_counts())

Summary Statistics for Numeric Columns:


Unnamed: 0,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude
count,196790,196788,196745.0,196787.0,191219.0,181106.0,196277.0,196277.0,196277.0,196277.0,196277.0,191273.0,191273.0,181495.0,181495.0
mean,2024-12-23 19:17:35.943899904,2024-12-23 19:36:58.333434880,1134.096343,6.242146,33.091183,25.396486,21.218281,2.219734,0.015494,1.677467,25.291858,41.897107,-87.688013,41.890595,-87.655739
min,2024-12-17 07:45:00,2024-12-17 07:45:00,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,41.660136,-87.913625,41.660136,-87.913625
25%,2024-12-19 17:00:00,2024-12-19 17:15:00,449.0,1.01,8.0,8.0,7.75,0.0,0.0,0.0,9.75,41.878866,-87.681356,41.877406,-87.663416
50%,2024-12-22 23:00:00,2024-12-22 23:15:00,840.0,2.99,32.0,28.0,14.0,0.0,0.0,0.0,16.5,41.895033,-87.634156,41.892508,-87.633308
75%,2024-12-27 19:00:00,2024-12-27 19:15:00,1507.0,11.07,49.0,32.0,30.75,3.0,0.0,1.0,32.61,41.944227,-87.625192,41.922686,-87.625192
max,2025-01-01 00:00:00,2025-01-03 17:45:00,84014.0,2166.39,77.0,77.0,7525.0,100.0,41.75,270.0,7525.0,42.016046,-87.534903,42.016046,-87.534903
std,,,1665.11844,8.449312,25.271747,20.37445,41.683905,3.79318,0.287547,5.425856,43.590133,0.065128,0.104763,0.05982,0.063138



Missing Values:


trip_end_timestamp                2
trip_seconds                     45
trip_miles                        3
pickup_community_area          5571
dropoff_community_area        15684
fare                            513
tips                            513
tolls                           513
extras                          513
trip_total                      513
pickup_centroid_latitude       5517
pickup_centroid_longitude      5517
dropoff_centroid_latitude     15295
dropoff_centroid_longitude    15295
dtype: int64


Unique Values:


trip_id                       196790
taxi_id                         2536
trip_start_timestamp            1410
trip_end_timestamp              1419
trip_seconds                    5485
trip_miles                      3596
pickup_community_area             77
dropoff_community_area            77
fare                            4580
tips                            1836
tolls                             46
extras                           502
trip_total                      6531
payment_type                       7
company                           38
pickup_centroid_latitude         249
pickup_centroid_longitude        249
dropoff_centroid_latitude        304
dropoff_centroid_longitude       304
pickup_census_tract              182
dropoff_census_tract             250
dtype: int64


Data Types:


trip_id                               object
taxi_id                               object
trip_start_timestamp          datetime64[ns]
trip_end_timestamp            datetime64[ns]
trip_seconds                         float64
trip_miles                           float64
pickup_community_area                  Int64
dropoff_community_area                 Int64
fare                                 float64
tips                                 float64
tolls                                float64
extras                               float64
trip_total                           float64
payment_type                          object
company                               object
pickup_centroid_latitude             float64
pickup_centroid_longitude            float64
dropoff_centroid_latitude            float64
dropoff_centroid_longitude           float64
pickup_census_tract                   object
dropoff_census_tract                  object
dtype: object


Correlation Matrix:


Unnamed: 0,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude
trip_seconds,1.0,0.366714,0.229907,0.073476,0.191242,0.165725,0.024861,0.109069,0.211239,0.1092,-0.268995,0.0377,-0.133272
trip_miles,0.366714,1.0,0.421576,0.143922,0.29163,0.315968,0.078201,0.275212,0.341738,0.130027,-0.445706,-0.017483,-0.163404
pickup_community_area,0.229907,0.421576,1.0,0.110819,0.212365,0.355645,0.070401,0.364025,0.283653,0.08412,-0.734563,-0.041495,-0.083049
dropoff_community_area,0.073476,0.143922,0.110819,1.0,0.079591,-0.031839,0.024853,-0.026136,0.072845,-0.199637,0.050414,-0.338203,-0.393952
fare,0.191242,0.29163,0.212365,0.079591,1.0,0.15525,0.033682,0.111836,0.9842,0.05585,-0.222516,-0.009419,-0.089219
tips,0.165725,0.315968,0.355645,-0.031839,0.15525,1.0,0.085069,0.364316,0.284196,0.276971,-0.474351,0.169819,-0.140813
tolls,0.024861,0.078201,0.070401,0.024853,0.033682,0.085069,1.0,0.147156,0.064848,0.05212,-0.089719,0.009004,-0.041798
extras,0.109069,0.275212,0.364025,-0.026136,0.111836,0.364316,0.147156,1.0,0.265012,0.221755,-0.44914,0.098657,-0.084406
trip_total,0.211239,0.341738,0.283653,0.072845,0.9842,0.284196,0.064848,0.265012,1.0,0.109493,-0.316215,0.012839,-0.105419
pickup_centroid_latitude,0.1092,0.130027,0.08412,-0.199637,0.05585,0.276971,0.05212,0.221755,0.109493,1.0,-0.592178,0.480385,-0.219505



Value Counts for Categorical Columns:

Value Counts for trip_id:


trip_id
0b4343256c21250cf6b159c57e68bfa4fa38644c    1
f3a9abf8f20738d79ead51dc52a152b869405079    1
310e0c9d20fb19370f0b4a80aea9c64ce64ca3cd    1
53fc5da5892235cce7b470dc94440e491eae9532    1
65b4347a7a8525e779b1be2b3b93b39bb50ebeeb    1
                                           ..
fbddf782bf4676e94412343f85e7918701c11479    1
30522d344338e147d42466377280d2fa196cb5da    1
b199d37bd0ac16e4314abb25d45a11a4871bff5d    1
f6391decd6a17009e92b94ccc41cb5885f26f972    1
15b8da9ff4ad4456ed23d06984eccbee0f6faead    1
Name: count, Length: 196790, dtype: int64


Value Counts for taxi_id:


taxi_id
14685d7c19b1bcf05466194de4a62ee39d4144f97867505373401607acec327e9b715f57d5ae38f00020eb0a6325c0923967d4cdfbefb2ed0930d80a9fcdae42    405
d40dae7ea46d61abca67eb53b157fe9cf0b485cca6dce122604588a69aa6c4b6b78e0e5c5fd11f9702babd94016122df1d328a459c8b7de2cb37a1bad947b1fe    398
38f6145c9a2b848dc1baa16fd91087e606b12bcb8757a9eb003dfab2c031fcaeb931c1ae6b486fab5f1c21037f33a187d1cb97080f4334a63f7ce0713d0f47b4    333
c81a672f30763423ddc7abac2b2e20a2f1ea045088ef77c48fd28823020204b955ebb5fa6cbc3b927c6ee044aaae2f09f174e1dac4195c8a378c14a2cc952093    307
78fb99d332dd664846f7934b7292dbb205641674541ff27091934949dc87ec67bc3c4f03ce935f50b045593d790b7aa5aa24b79f2d193d76d135a34c4150bb06    303
                                                                                                                                   ... 
1d3d71ae4ecc5c8243d1ab4c6bc69627c5ada2cbacaf423415c566d7c369476819008c9c3085b0c9b4601fb1aa2578c72cb5b9580e31fcdbfe100b4ce1884aae      1
176ae638bf4a32a363b4d407ddb76e7638ab800f


Value Counts for payment_type:


payment_type
Cash           61958
Credit Card    60969
Mobile         34734
Prcard         28680
Unknown         9576
No Charge        732
Dispute          141
Name: count, dtype: int64


Value Counts for company:


company
Flash Cab                               43905
Taxi Affiliation Services               32535
Taxicab Insurance Agency Llc            21244
Sun Taxi                                20155
City Service                            18710
Chicago Independents                    10606
5 Star Taxi                              9973
Blue Ribbon Taxi Association             8747
Globe Taxi                               7041
Tac - Yellow Cab Association             6964
Medallion Leasin                         4050
Taxicab Insurance Agency, LLC            2496
Choice Taxi Association                  2222
Choice Taxi Association Inc              1777
Chicago City Taxi Association            1773
Wolley Taxi                               952
Top Cab                                   493
Tac - Checker Cab Dispatch                402
Tac - American United Dispatch            365
Koam Taxi Association                     330
312 Medallion Management Corp             259
Star North Taxi Management


Value Counts for pickup_census_tract:


pickup_census_tract
nan            126784
17031980000     11392
17031839100      8569
17031320100      7804
17031281900      7321
                ...  
17031510300         1
17031843600         1
17031040201         1
17031420400         1
17031390600         1
Name: count, Length: 182, dtype: int64


Value Counts for dropoff_census_tract:


dropoff_census_tract
nan            128584
17031839100      9783
17031320100      7529
17031081500      4669
17031281900      4279
                ...  
17031841800         1
17031062200         1
17031030400         1
17031340500         1
17031560300         1
Name: count, Length: 250, dtype: int64

## Handle missing data

In [6]:
# Convert Int64 columns to float before filling missing values
int_columns = df.select_dtypes(include=['Int64']).columns
df[int_columns] = df[int_columns].astype(float)

# Fill missing values in numeric columns with the mean of each column
numeric_columns = df.select_dtypes(include=['number']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Display the DataFrame to verify the changes
display(df.head())

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,...,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
0,f3a9abf8f20738d79ead51dc52a152b869405079,ce8e5531edf9255d9ecb2f930e320f9c1c83229e437445...,2025-01-01,2025-01-01 00:15:00,601.0,5.14,76.0,10.0,15.25,3.11,...,5.0,23.86,Credit Card,Blue Ribbon Taxi Association,41.980264,-87.913625,41.985015,-87.804532,,
1,310e0c9d20fb19370f0b4a80aea9c64ce64ca3cd,018e2fc4d3e35a1f76f3221f8ecf824591aea4e6e7e05f...,2025-01-01,2025-01-01 00:00:00,340.0,1.45,28.0,28.0,20.0,4.1,...,0.0,24.6,Mobile,Sun Taxi,41.874005,-87.663518,41.874005,-87.663518,,
2,53fc5da5892235cce7b470dc94440e491eae9532,9ac243b4bb4548e7214a248a491aa3ddddf391820e6c7b...,2025-01-01,2025-01-01 00:45:00,2886.0,18.28,76.0,32.0,51.5,5.6,...,4.0,61.6,Mobile,Medallion Leasin,41.980264,-87.913625,41.878866,-87.625192,,
3,65b4347a7a8525e779b1be2b3b93b39bb50ebeeb,2fea69c8a6e08471bc4339a05e9ee7955bef68d791f77a...,2025-01-01,2025-01-01 00:00:00,795.0,15.09,33.091183,34.0,37.0,0.0,...,0.0,37.0,Cash,Sun Taxi,41.897107,-87.688013,41.842076,-87.633973,,
4,604c104c64de68af93a0f69ed52ba585f9f26427,d3b33673ee1a39325983e66d426003caf668f4509230cb...,2025-01-01,2025-01-01 00:30:00,1920.0,8.0,41.0,43.0,25.5,0.0,...,0.0,25.5,Unknown,Taxi Affiliation Services,41.79409,-87.592311,41.761578,-87.572782,,


### Check rows where the trip distance is zero (as that will affect downstream kpis)

In [7]:
zero_trip_miles_count = df[df['trip_miles'] == 0].shape[0]
print(f"Number of rows with trip_miles=0: {zero_trip_miles_count}")

Number of rows with trip_miles=0: 17111


### Drop trips that have no distance (anomolies)

In [8]:
original_row_count = df.shape[0]

df = df[df['trip_miles'] > 0].reset_index(drop=True)

# Calculate the number of rows dropped
rows_dropped = original_row_count - df.shape[0]
print(f"Number of rows dropped: {rows_dropped}")

# Display the DataFrame to verify the changes
display(df.head())

Number of rows dropped: 17111


Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area,dropoff_community_area,fare,tips,...,extras,trip_total,payment_type,company,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,pickup_census_tract,dropoff_census_tract
0,f3a9abf8f20738d79ead51dc52a152b869405079,ce8e5531edf9255d9ecb2f930e320f9c1c83229e437445...,2025-01-01,2025-01-01 00:15:00,601.0,5.14,76.0,10.0,15.25,3.11,...,5.0,23.86,Credit Card,Blue Ribbon Taxi Association,41.980264,-87.913625,41.985015,-87.804532,,
1,310e0c9d20fb19370f0b4a80aea9c64ce64ca3cd,018e2fc4d3e35a1f76f3221f8ecf824591aea4e6e7e05f...,2025-01-01,2025-01-01 00:00:00,340.0,1.45,28.0,28.0,20.0,4.1,...,0.0,24.6,Mobile,Sun Taxi,41.874005,-87.663518,41.874005,-87.663518,,
2,53fc5da5892235cce7b470dc94440e491eae9532,9ac243b4bb4548e7214a248a491aa3ddddf391820e6c7b...,2025-01-01,2025-01-01 00:45:00,2886.0,18.28,76.0,32.0,51.5,5.6,...,4.0,61.6,Mobile,Medallion Leasin,41.980264,-87.913625,41.878866,-87.625192,,
3,65b4347a7a8525e779b1be2b3b93b39bb50ebeeb,2fea69c8a6e08471bc4339a05e9ee7955bef68d791f77a...,2025-01-01,2025-01-01 00:00:00,795.0,15.09,33.091183,34.0,37.0,0.0,...,0.0,37.0,Cash,Sun Taxi,41.897107,-87.688013,41.842076,-87.633973,,
4,604c104c64de68af93a0f69ed52ba585f9f26427,d3b33673ee1a39325983e66d426003caf668f4509230cb...,2025-01-01,2025-01-01 00:30:00,1920.0,8.0,41.0,43.0,25.5,0.0,...,0.0,25.5,Unknown,Taxi Affiliation Services,41.79409,-87.592311,41.761578,-87.572782,,


### Check again to confirm no anomolies

In [9]:
zero_trip_miles_count = df[df['trip_miles'] == 0].shape[0]
print(f"Number of rows with trip_miles=0: {zero_trip_miles_count}")

Number of rows with trip_miles=0: 0


## Create a normalized Data Model of dimensions

### Create a Dim for Time

In [10]:
# Extract start and end timestamps
start_times = df['trip_start_timestamp']
end_times = df['trip_end_timestamp']

# Combine start and end timestamps into a single series
all_times = pd.concat([start_times, end_times]).drop_duplicates().reset_index(drop=True)

# Create the time dimension dataframe
dim_time = pd.DataFrame({
    'fk_trip_timestamp': all_times,
    'date': all_times.dt.date,
    'time': all_times.dt.time,
    'day_of_week': all_times.dt.dayofweek,
    'month': all_times.dt.month,
    'hour': all_times.dt.hour,
    'minute': all_times.dt.minute
})

# Map day of week to abbreviated string
day_of_week_map = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
dim_time['day_of_week_str'] = dim_time['day_of_week'].map(day_of_week_map)

# Set fk_trip_timestamp as the index
dim_time.set_index('fk_trip_timestamp', inplace=True)

# Display the time dimension dataframe
display(dim_time.head())
dim_time.dtypes

Unnamed: 0_level_0,date,time,day_of_week,month,hour,minute,day_of_week_str
fk_trip_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-01-01 00:00:00,2025-01-01,00:00:00,2,1,0,0,Wed
2024-12-31 23:45:00,2024-12-31,23:45:00,1,12,23,45,Tue
2024-12-31 23:30:00,2024-12-31,23:30:00,1,12,23,30,Tue
2024-12-31 23:15:00,2024-12-31,23:15:00,1,12,23,15,Tue
2024-12-31 23:00:00,2024-12-31,23:00:00,1,12,23,0,Tue


date               object
time               object
day_of_week         int32
month               int32
hour                int32
minute              int32
day_of_week_str    object
dtype: object

### Create a Dim for Location Data

#### We can use both Lat and Long as foreign keys

In [11]:
# Create a new DataFrame for the location dimension
df['community_area'] = df.apply(lambda row: row['pickup_community_area'] if pd.notnull(row['pickup_community_area']) else row['dropoff_community_area'], axis=1).round().astype(int)
dim_location = pd.DataFrame()

# Combine pickup and dropoff locations into a single column
dim_location['fk_latitude'] = df.apply(lambda row: row['pickup_centroid_latitude'] if pd.notnull(row['pickup_centroid_latitude']) else row['dropoff_centroid_latitude'], axis=1)
dim_location['fk_longitude'] = df.apply(lambda row: row['pickup_centroid_longitude'] if pd.notnull(row['pickup_centroid_longitude']) else row['dropoff_centroid_longitude'], axis=1)

# Include community info
dim_location['community_area'] = df['community_area']

# Drop duplicates and reset index
dim_location = dim_location.drop_duplicates().reset_index(drop=True)

# Display the location dimension dataframe
display(dim_location.head())
dim_location.dtypes


Unnamed: 0,fk_latitude,fk_longitude,community_area
0,41.980264,-87.913625,76
1,41.874005,-87.663518,28
2,41.897107,-87.688013,33
3,41.79409,-87.592311,41
4,41.878866,-87.625192,32


fk_latitude       float64
fk_longitude      float64
community_area      int64
dtype: object

### Create a Dim for Payment Types

In [12]:
# Extract unique payment types
unique_payment_types = df['payment_type'].unique()

# Create the payment type dimension dataframe
dim_payment_type = pd.DataFrame({
    'payment_type': unique_payment_types,
    'payment_type_key': range(1, len(unique_payment_types) + 1)
})

# Display the payment type dimension dataframe
display(dim_payment_type)

Unnamed: 0,payment_type,payment_type_key
0,Credit Card,1
1,Mobile,2
2,Cash,3
3,Unknown,4
4,Prcard,5
5,Dispute,6
6,No Charge,7


### Create a Dim for Taxi Companies based on the Taxi Id

In [13]:
# Create the taxi dimension dataframe
dim_taxi = df[['taxi_id', 'company']].drop_duplicates().reset_index(drop=True)
dim_taxi.rename(columns={'taxi_id': 'fk_taxi_id'}, inplace=True)

# Display the taxi dimension dataframe
display(dim_taxi.head(10))


Unnamed: 0,fk_taxi_id,company
0,ce8e5531edf9255d9ecb2f930e320f9c1c83229e437445...,Blue Ribbon Taxi Association
1,018e2fc4d3e35a1f76f3221f8ecf824591aea4e6e7e05f...,Sun Taxi
2,9ac243b4bb4548e7214a248a491aa3ddddf391820e6c7b...,Medallion Leasin
3,2fea69c8a6e08471bc4339a05e9ee7955bef68d791f77a...,Sun Taxi
4,d3b33673ee1a39325983e66d426003caf668f4509230cb...,Taxi Affiliation Services
5,e7f8c9242fc38babca76de5c34b1e59b9b7ae3ff40812c...,Taxicab Insurance Agency Llc
6,11bb28dc5075f790bd4529d80a571002aeb69fd4145015...,City Service
7,f35fcc7d28fd3ca0324ad6b42c6bf1dd0413320a9c4b65...,Medallion Leasin
8,99b1203df31b26f1c372e57926d731514ae839dbb485f3...,Chicago City Taxi Association
9,9d91a7d7989a270ae46c41eaf82aebaa3540e19b13ef9c...,Wolley Taxi


### Create the Fact Table merging with Dim tables

#### The number of records is around 200K. This means the merge operations must be done in chunks, along with other mem optimizations

##### What happends if we increase the number of rows to 1M or 2M? (We will see later....)

In [14]:
# Ensure trip_start_timestamp is a datetime type
df['trip_start_timestamp'] = pd.to_datetime(df['trip_start_timestamp'])
df['trip_end_timestamp'] = pd.to_datetime(df['trip_end_timestamp'])

# Select only necessary columns to reduce memory usage
df_reduced = df[['trip_id', 'taxi_id', 'trip_start_timestamp', 'trip_end_timestamp', 
                 'pickup_centroid_latitude', 'pickup_centroid_longitude', 
                 'dropoff_centroid_latitude', 'dropoff_centroid_longitude', 
                 'trip_seconds', 'trip_miles', 'fare', 'tips', 'tolls', 'extras', 
                 'trip_total', 'payment_type', 'company', 'pickup_community_area', 'dropoff_community_area']].copy()

# Convert columns to appropriate types to reduce memory usage
df_reduced.loc[:, 'trip_id'] = df_reduced['trip_id'].astype('category')
df_reduced.loc[:, 'taxi_id'] = df_reduced['taxi_id'].astype('category')
df_reduced.loc[:, 'payment_type'] = df_reduced['payment_type'].astype('category')
df_reduced.loc[:, 'company'] = df_reduced['company'].astype('category')

# Downcast numeric columns to more memory-efficient types
df_reduced.loc[:, 'trip_seconds'] = pd.to_numeric(df_reduced['trip_seconds'], downcast='integer')
df_reduced.loc[:, 'trip_miles'] = pd.to_numeric(df_reduced['trip_miles'], downcast='float')
df_reduced.loc[:, 'fare'] = pd.to_numeric(df_reduced['fare'], downcast='float')
df_reduced.loc[:, 'tips'] = pd.to_numeric(df_reduced['tips'], downcast='float')
df_reduced.loc[:, 'tolls'] = pd.to_numeric(df_reduced['tolls'], downcast='float')
df_reduced.loc[:, 'extras'] = pd.to_numeric(df_reduced['extras'], downcast='float')
df_reduced.loc[:, 'trip_total'] = pd.to_numeric(df_reduced['trip_total'], downcast='float')
df_reduced.loc[:, 'pickup_centroid_latitude'] = pd.to_numeric(df_reduced['pickup_centroid_latitude'], downcast='float')
df_reduced.loc[:, 'pickup_centroid_longitude'] = pd.to_numeric(df_reduced['pickup_centroid_longitude'], downcast='float')
df_reduced.loc[:, 'dropoff_centroid_latitude'] = pd.to_numeric(df_reduced['dropoff_centroid_latitude'], downcast='float')
df_reduced.loc[:, 'dropoff_centroid_longitude'] = pd.to_numeric(df_reduced['dropoff_centroid_longitude'], downcast='float')

# Round and convert community areas to integers
df_reduced['pickup_community_area'] = df_reduced['pickup_community_area'].round().astype('Int64')
df_reduced['dropoff_community_area'] = df_reduced['dropoff_community_area'].round().astype('Int64')

# Add taxi_key to dim_taxi
dim_taxi['taxi_key'] = range(1, len(dim_taxi) + 1)

# Process data in chunks to avoid memory errors
chunk_size = 10000  # Reduce chunk size to avoid memory errors
chunks = []

for start in range(0, len(df_reduced), chunk_size):
    chunk = df_reduced.iloc[start:start + chunk_size].copy()

    # Merge chunk with dim_time to get the start time dimension keys
    chunk = chunk.merge(dim_time.reset_index(), left_on='trip_start_timestamp', right_on='fk_trip_timestamp', how='left', suffixes=('', '_start'))

    # Merge chunk with dim_time to get the end time dimension keys
    chunk = chunk.merge(dim_time.reset_index(), left_on='trip_end_timestamp', right_on='fk_trip_timestamp', how='left', suffixes=('', '_end'))

    # Merge with dim_location to get the pickup and dropoff location keys
    chunk = chunk.merge(dim_location.rename(columns={'fk_latitude': 'pickup_latitude', 'fk_longitude': 'pickup_longitude', 'community_area': 'pickup_community_area_key'}), left_on=['pickup_centroid_latitude', 'pickup_centroid_longitude'], right_on=['pickup_latitude', 'pickup_longitude'], how='left')
    chunk = chunk.merge(dim_location.rename(columns={'fk_latitude': 'dropoff_latitude', 'fk_longitude': 'dropoff_longitude', 'community_area': 'dropoff_community_area_key'}), left_on=['dropoff_centroid_latitude', 'dropoff_centroid_longitude'], right_on=['dropoff_latitude', 'dropoff_longitude'], how='left')

    # Merge with dim_taxi to get the taxi keys
    chunk = chunk.merge(dim_taxi.rename(columns={'fk_taxi_id': 'taxi_id'}), on=['taxi_id', 'company'], how='left')

    # Merge with dim_payment_type to get the payment type keys
    chunk = chunk.merge(dim_payment_type, on='payment_type', how='left')

    # Select and rename the relevant columns for the fact table
    chunk = chunk[['trip_id', 'taxi_id', 'trip_start_timestamp', 'trip_end_timestamp', 
                   'pickup_centroid_latitude', 'pickup_centroid_longitude', 
                   'dropoff_centroid_latitude', 'dropoff_centroid_longitude', 
                   'trip_seconds', 'trip_miles', 'fare', 'tips', 'tolls', 'extras', 
                   'trip_total', 'payment_type_key', 'pickup_community_area', 'dropoff_community_area']]

    chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
fact_trips = pd.concat(chunks, ignore_index=True)
# Display the total number of records in the fact table
total_rows_fact_table = fact_trips.shape[0]
print(f"Total number of records in the fact table: {total_rows_fact_table}")
# Display the fact table
display(fact_trips.head())

Total number of records in the fact table: 179679


Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,payment_type_key,pickup_community_area,dropoff_community_area
0,f3a9abf8f20738d79ead51dc52a152b869405079,ce8e5531edf9255d9ecb2f930e320f9c1c83229e437445...,2025-01-01,2025-01-01 00:15:00,41.980263,-87.913628,41.985016,-87.804535,601.0,5.14,15.25,3.11,0.0,5.0,23.860001,1,76,10
1,310e0c9d20fb19370f0b4a80aea9c64ce64ca3cd,018e2fc4d3e35a1f76f3221f8ecf824591aea4e6e7e05f...,2025-01-01,2025-01-01 00:00:00,41.874004,-87.663521,41.874004,-87.663521,340.0,1.45,20.0,4.1,0.0,0.0,24.6,2,28,28
2,53fc5da5892235cce7b470dc94440e491eae9532,9ac243b4bb4548e7214a248a491aa3ddddf391820e6c7b...,2025-01-01,2025-01-01 00:45:00,41.980263,-87.913628,41.878864,-87.625191,2886.0,18.280001,51.5,5.6,0.0,4.0,61.599998,2,76,32
3,65b4347a7a8525e779b1be2b3b93b39bb50ebeeb,2fea69c8a6e08471bc4339a05e9ee7955bef68d791f77a...,2025-01-01,2025-01-01 00:00:00,41.897106,-87.688011,41.842075,-87.633972,795.0,15.09,37.0,0.0,0.0,0.0,37.0,3,33,34
4,604c104c64de68af93a0f69ed52ba585f9f26427,d3b33673ee1a39325983e66d426003caf668f4509230cb...,2025-01-01,2025-01-01 00:30:00,41.79409,-87.592308,41.761578,-87.572784,1920.0,8.0,25.5,0.0,0.0,0.0,25.5,4,41,43


#### Check all Tables

In [15]:
# Display all dimension tables
print("Time Dimension Table (Total Rows: {}):".format(len(dim_time)))
display(dim_time.head())

print("Location Dimension Table (Total Rows: {}):".format(len(dim_location)))
display(dim_location.head())

print("Payment Type Dimension Table (Total Rows: {}):".format(len(dim_payment_type)))
display(dim_payment_type.head())

print("Taxi Dimension Table (Total Rows: {}):".format(len(dim_taxi)))
display(dim_taxi.head())

# Display the final fact table
print("Fact Table (Total Rows: {}):".format(len(fact_trips)))
display(fact_trips.head())

Time Dimension Table (Total Rows: 1419):


Unnamed: 0_level_0,date,time,day_of_week,month,hour,minute,day_of_week_str
fk_trip_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-01-01 00:00:00,2025-01-01,00:00:00,2,1,0,0,Wed
2024-12-31 23:45:00,2024-12-31,23:45:00,1,12,23,45,Tue
2024-12-31 23:30:00,2024-12-31,23:30:00,1,12,23,30,Tue
2024-12-31 23:15:00,2024-12-31,23:15:00,1,12,23,15,Tue
2024-12-31 23:00:00,2024-12-31,23:00:00,1,12,23,0,Tue


Location Dimension Table (Total Rows: 226):


Unnamed: 0,fk_latitude,fk_longitude,community_area
0,41.980264,-87.913625,76
1,41.874005,-87.663518,28
2,41.897107,-87.688013,33
3,41.79409,-87.592311,41
4,41.878866,-87.625192,32


Payment Type Dimension Table (Total Rows: 7):


Unnamed: 0,payment_type,payment_type_key
0,Credit Card,1
1,Mobile,2
2,Cash,3
3,Unknown,4
4,Prcard,5


Taxi Dimension Table (Total Rows: 2781):


Unnamed: 0,fk_taxi_id,company,taxi_key
0,ce8e5531edf9255d9ecb2f930e320f9c1c83229e437445...,Blue Ribbon Taxi Association,1
1,018e2fc4d3e35a1f76f3221f8ecf824591aea4e6e7e05f...,Sun Taxi,2
2,9ac243b4bb4548e7214a248a491aa3ddddf391820e6c7b...,Medallion Leasin,3
3,2fea69c8a6e08471bc4339a05e9ee7955bef68d791f77a...,Sun Taxi,4
4,d3b33673ee1a39325983e66d426003caf668f4509230cb...,Taxi Affiliation Services,5


Fact Table (Total Rows: 179679):


Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,payment_type_key,pickup_community_area,dropoff_community_area
0,f3a9abf8f20738d79ead51dc52a152b869405079,ce8e5531edf9255d9ecb2f930e320f9c1c83229e437445...,2025-01-01,2025-01-01 00:15:00,41.980263,-87.913628,41.985016,-87.804535,601.0,5.14,15.25,3.11,0.0,5.0,23.860001,1,76,10
1,310e0c9d20fb19370f0b4a80aea9c64ce64ca3cd,018e2fc4d3e35a1f76f3221f8ecf824591aea4e6e7e05f...,2025-01-01,2025-01-01 00:00:00,41.874004,-87.663521,41.874004,-87.663521,340.0,1.45,20.0,4.1,0.0,0.0,24.6,2,28,28
2,53fc5da5892235cce7b470dc94440e491eae9532,9ac243b4bb4548e7214a248a491aa3ddddf391820e6c7b...,2025-01-01,2025-01-01 00:45:00,41.980263,-87.913628,41.878864,-87.625191,2886.0,18.280001,51.5,5.6,0.0,4.0,61.599998,2,76,32
3,65b4347a7a8525e779b1be2b3b93b39bb50ebeeb,2fea69c8a6e08471bc4339a05e9ee7955bef68d791f77a...,2025-01-01,2025-01-01 00:00:00,41.897106,-87.688011,41.842075,-87.633972,795.0,15.09,37.0,0.0,0.0,0.0,37.0,3,33,34
4,604c104c64de68af93a0f69ed52ba585f9f26427,d3b33673ee1a39325983e66d426003caf668f4509230cb...,2025-01-01,2025-01-01 00:30:00,41.79409,-87.592308,41.761578,-87.572784,1920.0,8.0,25.5,0.0,0.0,0.0,25.5,4,41,43


#### Validate num of rows in Fact

##### (Remember we dropped rows where Trip Miles==0)

In [16]:
total_rows_fact_table = fact_trips.shape[0]
print(f"Total number of rows in fact table: {total_rows_fact_table}")

Total number of rows in fact table: 179679


## Lets add some new metrics to the Fact Tables

In [17]:
# Calculate trip duration in minutes
fact_trips['trip_duration_minutes'] = fact_trips['trip_seconds'] / 60

# Ensure trip_seconds is never zero
fact_trips['trip_seconds'] = fact_trips['trip_seconds'].replace(0, 1)

# Categorize trip distance
bins = [0, 2, 5, 10, float('inf')]
labels = ['short', 'medium', 'long', 'very long']
fact_trips['trip_distance_category'] = pd.cut(fact_trips['trip_miles'], bins=bins, labels=labels)

# Calculate average speed (miles per hour)
fact_trips['average_speed_mph'] = fact_trips['trip_miles'] / (fact_trips['trip_seconds'] / 3600)

# Handle missing values
fact_trips.loc[:, 'fare'] = fact_trips['fare'].fillna(fact_trips['fare'].mean())
fact_trips.loc[:, 'tips'] = fact_trips['tips'].fillna(0)
fact_trips.loc[:, 'tolls'] = fact_trips['tolls'].fillna(0)
fact_trips.loc[:, 'extras'] = fact_trips['extras'].fillna(0)
fact_trips.loc[:, 'trip_total'] = fact_trips['trip_total'].fillna(fact_trips['fare'] + fact_trips['tips'] + fact_trips['tolls'] + fact_trips['extras'])

# Ensure no warnings for setting values on a copy of a slice
fact_trips = fact_trips.copy()

# Display the transformed fact table
display(fact_trips.head())

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,trip_seconds,trip_miles,...,tips,tolls,extras,trip_total,payment_type_key,pickup_community_area,dropoff_community_area,trip_duration_minutes,trip_distance_category,average_speed_mph
0,f3a9abf8f20738d79ead51dc52a152b869405079,ce8e5531edf9255d9ecb2f930e320f9c1c83229e437445...,2025-01-01,2025-01-01 00:15:00,41.980263,-87.913628,41.985016,-87.804535,601.0,5.14,...,3.11,0.0,5.0,23.860001,1,76,10,10.016667,long,30.788685
1,310e0c9d20fb19370f0b4a80aea9c64ce64ca3cd,018e2fc4d3e35a1f76f3221f8ecf824591aea4e6e7e05f...,2025-01-01,2025-01-01 00:00:00,41.874004,-87.663521,41.874004,-87.663521,340.0,1.45,...,4.1,0.0,0.0,24.6,2,28,28,5.666667,short,15.352942
2,53fc5da5892235cce7b470dc94440e491eae9532,9ac243b4bb4548e7214a248a491aa3ddddf391820e6c7b...,2025-01-01,2025-01-01 00:45:00,41.980263,-87.913628,41.878864,-87.625191,2886.0,18.280001,...,5.6,0.0,4.0,61.599998,2,76,32,48.1,very long,22.802496
3,65b4347a7a8525e779b1be2b3b93b39bb50ebeeb,2fea69c8a6e08471bc4339a05e9ee7955bef68d791f77a...,2025-01-01,2025-01-01 00:00:00,41.897106,-87.688011,41.842075,-87.633972,795.0,15.09,...,0.0,0.0,0.0,37.0,3,33,34,13.25,very long,68.332076
4,604c104c64de68af93a0f69ed52ba585f9f26427,d3b33673ee1a39325983e66d426003caf668f4509230cb...,2025-01-01,2025-01-01 00:30:00,41.79409,-87.592308,41.761578,-87.572784,1920.0,8.0,...,0.0,0.0,0.0,25.5,4,41,43,32.0,long,15.0


## Create a Grouped Dataset

In [18]:
# Group fact_trips by week, taxi_id, and community_area_id
grouped_fact_trips_by_wk = fact_trips.groupby([pd.Grouper(key='trip_start_timestamp', freq='W'), 'taxi_id', 'pickup_community_area']).agg({
    'trip_miles': ['mean', 'sum'],
    'fare': ['mean', 'sum'],
    'tips': ['mean', 'sum'],
    'tolls': ['mean', 'sum'],
    'extras': ['mean', 'sum'],
    'trip_total': ['mean', 'sum'],
    'trip_seconds': ['mean', 'sum'],
    'average_speed_mph': 'mean'
}).reset_index()

# Flatten the MultiIndex columns
grouped_fact_trips_by_wk.columns = ['week', 'taxi_id', 'community_area_id'] + ['_'.join(col).strip() for col in grouped_fact_trips_by_wk.columns.values[3:]]

# Split the 'week' column into 'from_date' and 'to_date'
grouped_fact_trips_by_wk['from_date'] = grouped_fact_trips_by_wk['week'].apply(lambda x: x - pd.offsets.Week(weekday=6)).dt.date
grouped_fact_trips_by_wk['to_date'] = grouped_fact_trips_by_wk['week'].apply(lambda x: x + pd.offsets.Week(weekday=6)).dt.date

# Convert 'from_date' and 'to_date' to datetime format
grouped_fact_trips_by_wk['from_date'] = pd.to_datetime(grouped_fact_trips_by_wk['from_date'])
grouped_fact_trips_by_wk['to_date'] = pd.to_datetime(grouped_fact_trips_by_wk['to_date'])

# Drop the original 'week' column
grouped_fact_trips_by_wk = grouped_fact_trips_by_wk.drop(columns=['week'])

# Set the new columns as the index
grouped_fact_trips_by_wk = grouped_fact_trips_by_wk.set_index(['from_date', 'to_date', 'taxi_id', 'community_area_id'])

# Convert all columns to appropriate data types
grouped_fact_trips_by_wk = grouped_fact_trips_by_wk.apply(pd.to_numeric, errors='coerce')

# Ensure no columns have object dtype
grouped_fact_trips_by_wk = grouped_fact_trips_by_wk.convert_dtypes()

# Display the updated grouped fact table
display(grouped_fact_trips_by_wk)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,trip_miles_mean,trip_miles_sum,fare_mean,fare_sum,tips_mean,tips_sum,tolls_mean,tolls_sum,extras_mean,extras_sum,trip_total_mean,trip_total_sum,trip_seconds_mean,trip_seconds_sum,average_speed_mph_mean
from_date,to_date,taxi_id,community_area_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2024-12-15,2024-12-29,0044e6c0d091476299b99345501f756b23632a96cbaf40e872fbf14f976410d3f938aacc643ed608b2aa42809222d4458e1aab5e0848e9b952f35616785c3a36,3,14.495,28.99,37.875,75.75,0.0,0.0,0.0,0.0,0.0,0.0,37.875,75.75,1656.5,3313.0,31.766051
2024-12-15,2024-12-29,0044e6c0d091476299b99345501f756b23632a96cbaf40e872fbf14f976410d3f938aacc643ed608b2aa42809222d4458e1aab5e0848e9b952f35616785c3a36,5,12.56,12.56,36.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,36.0,2025.0,2025.0,22.32889
2024-12-15,2024-12-29,0044e6c0d091476299b99345501f756b23632a96cbaf40e872fbf14f976410d3f938aacc643ed608b2aa42809222d4458e1aab5e0848e9b952f35616785c3a36,7,11.48,11.48,34.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,34.0,2587.0,2587.0,15.97526
2024-12-15,2024-12-29,0044e6c0d091476299b99345501f756b23632a96cbaf40e872fbf14f976410d3f938aacc643ed608b2aa42809222d4458e1aab5e0848e9b952f35616785c3a36,8,3.67625,29.41,13.40625,107.25,1.0725,8.58,0.0,0.0,0.0,0.0,15.29125,122.33,755.625,6045.0,13.303494
2024-12-15,2024-12-29,0044e6c0d091476299b99345501f756b23632a96cbaf40e872fbf14f976410d3f938aacc643ed608b2aa42809222d4458e1aab5e0848e9b952f35616785c3a36,28,7.823334,23.470001,22.583333,67.75,0.0,0.0,0.0,0.0,0.666667,2.0,23.25,69.75,1113.0,3339.0,23.307073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-29,2025-01-12,ffda53354c610fd3af1aee46d723028a49014e35f7280ca3d812716eb3cf1f0a763556368e4139b7183ee1fb36e26104de62bb86601f2fa16e3d96167e25b84e,3,10.62,10.62,28.75,28.75,0.0,0.0,0.0,0.0,0.0,0.0,28.75,28.75,1315.0,1315.0,29.073764
2024-12-29,2025-01-12,ffda53354c610fd3af1aee46d723028a49014e35f7280ca3d812716eb3cf1f0a763556368e4139b7183ee1fb36e26104de62bb86601f2fa16e3d96167e25b84e,4,1.285,2.57,6.75,13.5,0.0,0.0,0.0,0.0,0.0,0.0,6.75,13.5,379.5,759.0,9.495868
2024-12-29,2025-01-12,ffda53354c610fd3af1aee46d723028a49014e35f7280ca3d812716eb3cf1f0a763556368e4139b7183ee1fb36e26104de62bb86601f2fa16e3d96167e25b84e,6,4.8,14.4,14.666667,44.0,0.666667,2.0,0.0,0.0,0.0,0.0,15.666667,47.0,607.0,1821.0,20.049266
2024-12-29,2025-01-12,ffda53354c610fd3af1aee46d723028a49014e35f7280ca3d812716eb3cf1f0a763556368e4139b7183ee1fb36e26104de62bb86601f2fa16e3d96167e25b84e,8,17.83,17.83,43.75,43.75,0.0,0.0,0.0,0.0,0.0,0.0,43.75,43.75,1389.0,1389.0,46.211663


## Finally Save the Dataframes in Pickle Files so they can be referenced outside of this file

In [19]:
import pickle
import os

# Create the pickles directory if it does not exist
pickles_dir = '../../.pickles'
os.makedirs(pickles_dir, exist_ok=True)

# List of DataFrame names
df_names = ['dim_location', 'dim_payment_type', 'dim_taxi', 'dim_time', 'fact_trips', 'grouped_fact_trips_by_wk']

# Loop through the list and store each DataFrame
for name in df_names:
    with open(f"{pickles_dir}/{name}.pkl", 'wb') as f:
        pickle.dump(globals()[name], f)
