In [2]:
### Start running cells from here for the extra credit problems

import pyarrow as pa 
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow.dataset as ds
from datetime import datetime

In [3]:
# Build a ref to the dataset.
dataset = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"])


In [4]:
dataset.schema

vendor_id: string
pickup_at: timestamp[us]
dropoff_at: timestamp[us]
passenger_count: int8
trip_distance: float
pickup_longitude: float
pickup_latitude: float
rate_code_id: null
store_and_fwd_flag: string
dropoff_longitude: float
dropoff_latitude: float
payment_type: string
fare_amount: float
extra: float
mta_tax: float
tip_amount: float
tolls_amount: float
total_amount: float
year: int32
month: int32
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 2527

In [11]:
# Print info for each fragment
for fragment in fragments:
    print(f"Partition: {fragment.partition_expression}")
    print(f"number of rows: {fragment.count_rows()}")


Partition: ((year == 2009) and (month == 1))
number of rows: 14092413
Partition: ((year == 2009) and (month == 2))
number of rows: 13380122
Partition: ((year == 2009) and (month == 3))
number of rows: 14387371
Partition: ((year == 2009) and (month == 4))
number of rows: 14294783
Partition: ((year == 2009) and (month == 5))
number of rows: 14796313
Partition: ((year == 2009) and (month == 6))
number of rows: 14184249
Partition: ((year == 2009) and (month == 7))
number of rows: 13626103
Partition: ((year == 2009) and (month == 8))
number of rows: 13686520
Partition: ((year == 2009) and (month == 9))
number of rows: 13984887
Partition: ((year == 2009) and (month == 10))
number of rows: 15604551
Partition: ((year == 2009) and (month == 11))
number of rows: 14275339
Partition: ((year == 2009) and (month == 12))
number of rows: 14583404
Partition: ((year == 2010) and (month == 1))
number of rows: 14863778
Partition: ((year == 2010) and (month == 2))
number of rows: 100000
Partition: ((year =

number of rows: 8145164
Partition: ((year == 2018) and (month == 12))
number of rows: 8173231
Partition: ((year == 2019) and (month == 1))
number of rows: 7667792
Partition: ((year == 2019) and (month == 2))
number of rows: 7019375
Partition: ((year == 2019) and (month == 3))
number of rows: 7832545
Partition: ((year == 2019) and (month == 4))
number of rows: 7433139
Partition: ((year == 2019) and (month == 5))
number of rows: 7565261
Partition: ((year == 2019) and (month == 6))
number of rows: 6941024


In [13]:
total_count = 0

for fragment in fragments:
    row_count = fragment.count_rows()
    total_count += row_count
    
    print(f"Partition {fragment.partition_expression}: {row_count} rows")

print(f"\nTotal number of rows: {total_count}")

Partition ((year == 2009) and (month == 1)): 14092413 rows
Partition ((year == 2009) and (month == 2)): 13380122 rows
Partition ((year == 2009) and (month == 3)): 14387371 rows
Partition ((year == 2009) and (month == 4)): 14294783 rows
Partition ((year == 2009) and (month == 5)): 14796313 rows
Partition ((year == 2009) and (month == 6)): 14184249 rows
Partition ((year == 2009) and (month == 7)): 13626103 rows
Partition ((year == 2009) and (month == 8)): 13686520 rows
Partition ((year == 2009) and (month == 9)): 13984887 rows
Partition ((year == 2009) and (month == 10)): 15604551 rows
Partition ((year == 2009) and (month == 11)): 14275339 rows
Partition ((year == 2009) and (month == 12)): 14583404 rows
Partition ((year == 2010) and (month == 1)): 14863778 rows
Partition ((year == 2010) and (month == 2)): 100000 rows
Partition ((year == 2010) and (month == 4)): 15144990 rows
Partition ((year == 2010) and (month == 5)): 15481351 rows
Partition ((year == 2010) and (month == 6)): 14825128 r

*** Can you get the average transaction between 2:00-2:59 PM? ***

The idea here is to work smart, not hard. Instead of trying to handle all the data at once, we break it down into smaller, manageable pieces:

We don't process all fragments at the same time. We don't even work with all row groups (smaller chunks within fragments) at once. 

Instead, we take it one step at a time, processing just one row group at a time. This approach has two main benefits:

  * It's easier on your computer's memory. If a single row group is too big for your computer's RAM, you won't get stuck.
  * It works great if you have multiple computers or processors. Each one can handle a different row group, speeding things up.
  
  * So, we can figure out how to compute the value on a single row group, and then iterate over all row groups.

In [18]:
first_frag = next(dataset.get_fragments())

row_group_0 = first_frag.split_by_row_group()[0]

# conver that single row group to a a table while making sure we only read only the relveant columns
columns = ['pickup_at', 'total_amount']
row_group_0_table = row_group_0.to_table(columns=columns)

# Check the number of rows to make sure we've only read a single row group with two columns.
# We can do any computaiton we desire on this row group

row_group_0_table.shape

(65536, 2)

In [19]:
row_group_0_table

pyarrow.Table
pickup_at: timestamp[us]
total_amount: float
----
pickup_at: [[2009-01-04 02:52:00.000000,2009-01-04 03:31:00.000000,2009-01-03 15:43:00.000000,2009-01-01 20:52:58.000000,2009-01-24 16:18:23.000000,...,2009-01-01 22:42:49.000000,2009-01-04 18:27:32.000000,2009-01-04 11:48:33.000000,2009-01-04 23:21:04.000000,2009-01-04 16:11:27.000000]]
total_amount: [[9.4,14.6,28.44,18.45,3.7,...,13.28,5.9,4.9,5.4,8.5]]

In [7]:
import pyarrow as pa
from datetime import datetime, time

pickup_at_column = row_group_0_table.column("pickup_at")
total_amount_column = row_group_0_table.column("total_amount")

total_amount = 0
count = 0

# We can covert the pick up field to a python time, which makes it easy to work with
for i in range(len(pickup_at_column)):
    timestamp = pickup_at_column[i].as_py()
    transaction_time = timestamp.time()
    if time(14, 0) <= transaction_time <= time(14, 59):
        total_amount += total_amount_column[i].as_py()
        count += 1

# Calculate average transaction amount
average_transaction_amount = total_amount / count

In [8]:
average_transaction_amount

10.325556583854983

In [9]:
# you can iterate over all fragments using:
for i, row_group in enumerate(first_frag.split_by_row_group()):
    print(f"processing row group {i}")
    

processing row group 0
processing row group 1
processing row group 2
processing row group 3
processing row group 4
processing row group 5
processing row group 6
processing row group 7
processing row group 8
processing row group 9
processing row group 10
processing row group 11
processing row group 12
processing row group 13
processing row group 14
processing row group 15
processing row group 16
processing row group 17
processing row group 18
processing row group 19
processing row group 20
processing row group 21
processing row group 22
processing row group 23
processing row group 24
processing row group 25
processing row group 26
processing row group 27
processing row group 28
processing row group 29
processing row group 30
processing row group 31
processing row group 32
processing row group 33
processing row group 34
processing row group 35
processing row group 36
processing row group 37
processing row group 38
processing row group 39
processing row group 40
processing row group 41
pr

In [10]:
# And naturally, to work on fragments, you can use
for i, frag in enumerate(dataset.get_fragments()):
    print(f"processing frag {i}")
    

processing frag 0
processing frag 1
processing frag 2
processing frag 3
processing frag 4
processing frag 5
processing frag 6
processing frag 7
processing frag 8
processing frag 9
processing frag 10
processing frag 11
processing frag 12
processing frag 13
processing frag 14
processing frag 15
processing frag 16
processing frag 17
processing frag 18
processing frag 19
processing frag 20
processing frag 21
processing frag 22
processing frag 23
processing frag 24
processing frag 25
processing frag 26
processing frag 27
processing frag 28
processing frag 29
processing frag 30
processing frag 31
processing frag 32
processing frag 33
processing frag 34
processing frag 35
processing frag 36
processing frag 37
processing frag 38
processing frag 39
processing frag 40
processing frag 41
processing frag 42
processing frag 43
processing frag 44
processing frag 45
processing frag 46
processing frag 47
processing frag 48
processing frag 49
processing frag 50
processing frag 51
processing frag 52
pro