In [None]:
import dlt
from dlt.sources.helpers.rest_client import RESTClient
from dlt.sources.helpers.rest_client.paginators import PageNumberPaginator

### Extract Data

In [None]:
# https://us-central1-dlthub-analytics.cloudfunctions.net/data_engineering_zoomcamp_api

def paginated_getter(
    base_url="https://us-central1-dlthub-analytics.cloudfunctions.net",
    endpoint="data_engineering_zoomcamp_api",
    client=None,
):
    """Lazily yield pages from a page-numbered REST API, one page at a time.

    Called with no arguments this behaves as before: it queries the
    ``data_engineering_zoomcamp_api`` endpoint of the dlthub analytics host.

    Args:
        base_url: Root URL used to build the default ``RESTClient``. Ignored
            when ``client`` is supplied.
        endpoint: Path passed to ``client.paginate``.
        client: Optional pre-configured object exposing ``paginate(path)``.
            When ``None`` a ``RESTClient`` with a ``PageNumberPaginator`` is
            created.

    Yields:
        Whatever ``client.paginate(endpoint)`` produces, one item per page.
    """
    if client is None:
        client = RESTClient(
            base_url=base_url,
            # Page numbering starts at 1; total_path=None means no total-page
            # field is read from the response.
            # NOTE(review): per the dlt docs pagination then ends at the first
            # empty page — confirm for the installed dlt version.
            paginator=PageNumberPaginator(
                base_page=1,
                total_path=None,
            ),
        )

    # Delegate straight to the client's generator so pages stay lazy and are
    # never accumulated in memory.
    yield from client.paginate(endpoint)

# Print a short summary per page instead of the full payload: dumping every
# record of every page floods the cell output and hides the narrative.
# NOTE(review): assumes each page is a list-like sequence of records (supports
# len() and slicing) — confirm against the dlt version in use.
for page_number, page_data in enumerate(paginated_getter(), start=1):
    print(f"page {page_number}: {len(page_data)} records; first record: {page_data[:1]}")
    

### Normalize Data

In [None]:
# A single sample taxi ride used to demonstrate normalization: it mixes flat
# fields, nested objects ("time", "coordinates") and a list of objects
# ("passengers").
data = [
    {
        "vendor_name": "VTS",
        "record_hash": "b00361a396177a9cb410ff61f20015ad",
        # Nested object holding two timestamp strings.
        "time": {"pickup": "2009-06-14 23:23:00", "dropoff": "2009-06-14 23:48:00"},
        # Two levels of nesting: start/end -> lon/lat.
        "coordinates": {
            "start": {"lon": -73.787442, "lat": 40.641525},
            "end": {"lon": -73.980072, "lat": 40.742963},
        },
        # List of objects.
        "passengers": [
            {"name": "John", "rating": 4.9},
            {"name": "Jack", "rating": 3.9},
        ],
    },
]

In [None]:
# Configure a dlt pipeline named "ny_taxi_data" that targets the "duckdb"
# destination and the "taxi_rides" dataset.
pipeline = dlt.pipeline(pipeline_name="ny_taxi_data", destination="duckdb", dataset_name="taxi_rides")

# Run the pipeline on the sample `data`, writing to table "rides" with the
# "replace" write disposition, then show the returned load info.
load_info = pipeline.run(
    data,
    table_name="rides",
    write_disposition="replace",
)
print(load_info)

In [None]:
# Print the pipeline's `last_trace` attribute.
# NOTE(review): presumably dlt's record of the most recent run's steps — see
# the dlt pipeline documentation to confirm what it contains.
print(pipeline.last_trace)

In [None]:
# What dlt does for the loaded data (as listed by the notebook author):
#   - detects the schema automatically
#   - flattens nested JSON into `parent__child` columns
#   - handles data type conversion
#   - splits lists into child tables
#   - supports schema evolution
# The column index shown below lists the flattened names for the `rides` table.

pipeline.dataset(dataset_type="default").rides.df().columns

Index(['vendor_name', 'record_hash', 'time__pickup', 'time__dropoff',
       'coordinates__start__lon', 'coordinates__start__lat',
       'coordinates__end__lon', 'coordinates__end__lat', '_dlt_load_id',
       '_dlt_id'],
      dtype='object')

In [None]:
# Display the `rides` table as a DataFrame. The pickup/dropoff values, given
# as plain strings in `data`, appear as timestamps with a UTC offset
# (`+00:00`) in the output below.
pipeline.dataset(dataset_type="default").rides.df()

Unnamed: 0,vendor_name,record_hash,time__pickup,time__dropoff,coordinates__start__lon,coordinates__start__lat,coordinates__end__lon,coordinates__end__lat,_dlt_load_id,_dlt_id
0,VTS,b00361a396177a9cb410ff61f20015ad,2009-06-14 23:23:00+00:00,2009-06-14 23:48:00+00:00,-73.787442,40.641525,-73.980072,40.742963,1739688856.5638936,xGMMf6/jC+73lg


In [10]:
# The `passengers` list was split into the child table `rides__passengers`.
# In the output below each row's `_dlt_parent_id` matches the `_dlt_id` of its
# parent `rides` row, and `_dlt_list_idx` records its position in the list.
pipeline.dataset(dataset_type="default").rides__passengers.df()

Unnamed: 0,name,rating,_dlt_parent_id,_dlt_list_idx,_dlt_id
0,John,4.9,xGMMf6/jC+73lg,0,QI7k866HX/dEOA
1,Jack,3.9,xGMMf6/jC+73lg,1,63PqhTIUBS2oMw
