In [None]:
### Start running cells from here for the extra credit problems

import pyarrow as pa 
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow.dataset as ds
from datetime import datetime

In [None]:
# Build a reference to the taxi dataset stored in S3.
# The partitioning argument names the two partition fields: "year" and "month".
dataset = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"])


In [None]:
# Display the dataset's schema.
dataset.schema

In [None]:
# Print info for each fragment.
# `fragments` is defined here so this cell runs on a fresh kernel when starting
# from the top of this section; it is materialized as a list so that it can be
# iterated more than once.
fragments = list(dataset.get_fragments())

for fragment in fragments:
    print(f"Partition: {fragment.partition_expression}")
    print(f"number of rows: {fragment.count_rows()}")


In [None]:
# Count the rows of every fragment and accumulate the dataset-wide total.
# The fragments are requested from the dataset directly so this cell does not
# depend on a `fragments` variable left over from another cell.
total_count = 0

for fragment in dataset.get_fragments():
    row_count = fragment.count_rows()
    total_count += row_count

    print(f"Partition {fragment.partition_expression}: {row_count} rows")

print(f"\nTotal number of rows: {total_count}")

***Can you get the average transaction amount between 2:00 and 2:59 PM?***

The idea here is to work smart, not hard. Instead of trying to handle all the data at once, we break it down into smaller, manageable pieces:

We don't process all fragments at the same time. We don't even work with all row groups (smaller chunks within fragments) at once. 

Instead, we take it one step at a time, processing just one row group at a time. This approach has two main benefits:

  * It's easier on your computer's memory. If a single row group is too big for your computer's RAM, you won't get stuck.
  * It works great if you have multiple computers or processors. Each one can handle a different row group, speeding things up.
  
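So, we can figure out how to compute the value on a single row group, and then iterate over all row groups. For an average, that means keeping a running sum and a running count across row groups and dividing once at the end, rather than averaging the per-row-group averages.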

In [None]:
# Take the first fragment of the dataset.
first_frag = next(dataset.get_fragments())

# Split that fragment by row group and keep the first piece.
row_group_0 = first_frag.split_by_row_group()[0]

# Convert that single row group to a table, making sure we read only the relevant columns
columns = ['pickup_at', 'total_amount']
row_group_0_table = row_group_0.to_table(columns=columns)

# Check the shape to make sure we've only read a single row group with two columns.
# We can do any computation we desire on this row group

row_group_0_table.shape

In [None]:
# Display the table that was read from the first row group.
row_group_0_table

In [None]:
import pyarrow as pa
from datetime import datetime, time

pickup_at_column = row_group_0_table.column("pickup_at")
total_amount_column = row_group_0_table.column("total_amount")

# Half-open window [14:00:00, 15:00:00): every pickup from 2:00 PM up to and
# including 2:59:59.999999 PM. An upper bound of time(14, 59) would silently
# drop everything after 2:59:00 PM.
window_start = time(14, 0)
window_end = time(15, 0)

total_amount = 0
count = 0

# We can convert the pickup field to a Python time, which makes it easy to work with.
# Iterating the two columns in lockstep avoids a positional lookup per row.
for pickup_scalar, amount_scalar in zip(pickup_at_column, total_amount_column):
    timestamp = pickup_scalar.as_py()
    amount = amount_scalar.as_py()
    # Null values come back as None; they can neither be placed in the window
    # nor summed, so skip those rows.
    if timestamp is None or amount is None:
        continue
    transaction_time = timestamp.time()
    if window_start <= transaction_time < window_end:
        total_amount += amount
        count += 1

# Calculate average transaction amount. When no transaction falls inside the
# window the result is NaN instead of a ZeroDivisionError.
average_transaction_amount = total_amount / count if count else float("nan")

In [None]:
# Display the average transaction amount computed for this row group.
average_transaction_amount

In [None]:
# You can iterate over all row groups of a fragment (here, the first fragment) using:
for i, row_group in enumerate(first_frag.split_by_row_group()):
    print(f"processing row group {i}")
    

In [None]:
# And naturally, to work on every fragment of the dataset, you can use:
for i, frag in enumerate(dataset.get_fragments()):
    print(f"processing frag {i}")
    