<h1>Chapter 1: Schema & Lineage Objects</h1>

<p>
Source and staging are pre-defined schemas that represent the first two stages of the data ingestion pipeline.
They are defined as follows: 
</p>

<span>Source - Raw, unprocessed source data pulled from source files</span>
<ul> <code>SELECT * FROM {source_file}</code> </ul>

<span>Staging - Processed data from source with column metadata information applied</span>
<ul><code>SELECT TRY_CAST({column} AS {column_metadata.data_type}) AS {column} FROM source.{table}</code></ul>

In [1]:
import tyr

# Required to get relative path of test datasets
import os

# Required to illustrate how metadata is used
import pandas as pd

from pprint import pprint

# About The Data

The data used in this tutorial is from the 2023 Formula 1 Singapore Grand Prix, courtesy of FastF1 and the OpenF1 API.
This data is useful as it provides multiple tables connected by common ids, and numeric data with which we can calculate
various metrics (averages, std_dev, etc.).

Thank you to both of these projects for providing such a fantastic data source. If you want to check them out, they
can be found at the following addresses:

- https://github.com/theOehrly/Fast-F1
- https://openf1.org/

In [2]:
# This is the full process to read in file metadata, spelled out step by step.
# There is a function to do this in source called read_file_metadata.

# Resolve the configuration file relative to the notebook's working directory
file_metadata_path = os.path.abspath(
    os.path.join(os.getcwd(), "..", "tests/configurations/file_metadata.tsv")
)
file_metadata_frame = pd.read_csv(file_metadata_path, sep="\t")

display(file_metadata_frame)

# One FileMetadata object per row, keyed by the row's dataset name
file_metadata = {
    record.dataset: tyr.lineage.schema.source.FileMetadata(record)
    for _, record in file_metadata_frame.iterrows()
}

file_metadata

Unnamed: 0,schema,dataset,file_regex,delim,distinct
0,staging,car_location,/home/miles/tyr/tests/datasets/car_location_se...,t,True
1,staging,car_telemetry,/home/miles/tyr/tests/datasets/car_telemetry_s...,t,True
2,staging,circuits,/home/miles/tyr/tests/datasets/f1_circuits.geo...,geojson,True
3,staging,meetings,/home/miles/tyr/tests/datasets/meetings.tsv,t,True
4,staging,race_control,/home/miles/tyr/tests/datasets/race_control_se...,t,True
5,staging,session_status,/home/miles/tyr/tests/datasets/session_status_...,t,True
6,staging,track_status,/home/miles/tyr/tests/datasets/track_status_se...,t,True
7,staging,sessions,/home/miles/tyr/tests/datasets/sessions.tsv,t,True
8,staging,weather,/home/miles/tyr/tests/datasets/weather_session...,t,True
9,staging,results,/home/miles/tyr/tests/datasets/results_session...,t,True


{'car_location': <tyr.lineage.schema.source.FileMetadata at 0x7f327b692340>,
 'car_telemetry': <tyr.lineage.schema.source.FileMetadata at 0x7f327beefcd0>,
 'circuits': <tyr.lineage.schema.source.FileMetadata at 0x7f327b6924c0>,
 'meetings': <tyr.lineage.schema.source.FileMetadata at 0x7f327b692430>,
 'race_control': <tyr.lineage.schema.source.FileMetadata at 0x7f323b778c70>,
 'session_status': <tyr.lineage.schema.source.FileMetadata at 0x7f323b778d00>,
 'track_status': <tyr.lineage.schema.source.FileMetadata at 0x7f323b778dc0>,
 'sessions': <tyr.lineage.schema.source.FileMetadata at 0x7f323b778a90>,
 'weather': <tyr.lineage.schema.source.FileMetadata at 0x7f323b778af0>,
 'results': <tyr.lineage.schema.source.FileMetadata at 0x7f323b778d30>,
 'team_radio': <tyr.lineage.schema.source.FileMetadata at 0x7f323b778f70>,
 'pit_stops': <tyr.lineage.schema.source.FileMetadata at 0x7f323b778ac0>,
 'position': <tyr.lineage.schema.source.FileMetadata at 0x7f323b7331c0>,
 'intervals': <tyr.lineage.

In [3]:
# This is the full process to read in column metadata, spelled out step by step.
# There is a function to do this in source called read_column_metadata.
column_metadata_path = os.path.abspath(
    os.path.join(os.getcwd(), "..", "tests/configurations/column_metadata.tsv")
)
column_metadata_frame = pd.read_csv(column_metadata_path, sep="\t")

# Columns that get a dedicated dtype or default value
column_metadata_frame = column_metadata_frame.astype(
    {"ordinal_position": int, "is_primary_key": bool, "is_event_time": bool}
)
column_metadata_frame["filter_values"] = column_metadata_frame[
    "filter_values"
].fillna("[]")

# Every remaining column is treated as text: blanks become "" and values become str
typed_columns = ("ordinal_position", "is_primary_key", "is_event_time", "filter_values")
text_columns = [
    name for name in column_metadata_frame.columns.tolist() if name not in typed_columns
]
for name in text_columns:
    column_metadata_frame[name] = column_metadata_frame[name].fillna("").astype(str)

display(column_metadata_frame)

# Nested mapping: {dataset: {column_name: ColumnMetadata}}
column_metadata = {
    dataset: {
        record.column_name: tyr.lineage.schema.source.ColumnMetadata(record)
        for _, record in column_metadata_frame[
            column_metadata_frame["dataset"] == dataset
        ].iterrows()
    }
    for dataset in column_metadata_frame.dataset.unique()
}

column_metadata

Unnamed: 0,schema,dataset,column_name,column_alias,var_type,data_type,on_null,is_primary_key,is_event_time,filter_values,on_filter,regex,source_unit,target_unit,scale_factor,precision,ordinal_position
0,staging,car_location,Date,event_ts,timestamp,TIMESTAMP,PASS,True,True,[],PASS,%Y-%m-%d %H:%M:%S.%g,,,,,0
1,staging,car_location,Status,status,categorical,VARCHAR,PASS,False,False,[],PASS,,,,,,2
2,staging,car_location,X,x,numeric,"DECIMAL(15,5)",PASS,False,False,[],PASS,,m^1,,,1dp,3
3,staging,car_location,Y,y,numeric,"DECIMAL(15,5)",PASS,False,False,[],PASS,,m^1,,,1dp,4
4,staging,car_location,Z,z,numeric,"DECIMAL(15,5)",PASS,False,False,[],PASS,,m^1,,,1dp,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,staging,intervals,date,event_ts,timestamp,TIMESTAMP,PASS,True,True,[],PASS,,,,,,0
105,staging,intervals,session_key,,key,INTEGER,PASS,True,False,[],PASS,,,,,,1
106,staging,intervals,driver_number,,key,INTEGER,PASS,True,False,[],PASS,,,,,,2
107,staging,intervals,gap_to_leader,,numeric,FLOAT,PASS,False,False,[],PASS,,s^1,,,,3


{'car_location': {'Date': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6d54f0>,
  'Status': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6d5520>,
  'X': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6d5dc0>,
  'Y': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6de3a0>,
  'Z': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6dea60>,
  'Source': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6edc70>,
  'Time': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6edd90>,
  'SessionTime': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6eda90>,
  'DriverNumber': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6e79d0>},
 'car_telemetry': {'Date': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6e7dc0>,
  'RPM': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6e7100>,
  'Speed': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6f3850>,
  'nGear': <tyr.lineage.schema.source.ColumnMetadata at 0x7f323b6f38e0>,
  'Throttle': <tyr.

_________________________________________________________________________________________________________________
# Editing Metadata

We're going to build source to illustrate how column metadata affects the construction of a staging table.

**This is bad practice.** 

Always update column and file metadata in the configuration files and rebuild from
the configuration files. We're only doing this to demonstrate how changing column metadata
changes the staging query.

In [4]:
source = tyr.lineage.schema.source.Source(
    settings=tyr.lineage.schema.source.SourceSettings(
        file_metadata=file_metadata,
        expected_column_metadata=column_metadata,
        extensions=[{'name':"spatial", 'origin':'duckdb'}],
    )
)

# Staging columns are built using a function called lineage.macros.columns.staging_column_transform()
# Here, we will call it on a single source column.

car_location_subquery = tyr.lineage.tables.Subquery(source.tables.car_location)


def build_x_metadata_table(source_schema, subquery):
    """Build the one-column demo table from the current metadata of column "X".

    The single column is produced by staging_column_transform(), applied to the
    first column of a Select over ``source_schema.tables.car_location`` together
    with the expected column metadata stored for "X" on that table.

    Parameters
    ----------
    source_schema
        Source schema object exposing ``tables.car_location``.
    subquery
        Subquery over the car_location table, used as the source of the new table.

    Returns
    -------
    A ``tyr.lineage.tables.Core`` table named "test_column_metadata_change".
    """
    car_location = source_schema.tables.car_location
    return tyr.lineage.tables.Core(
        name="test_column_metadata_change",
        source=subquery,
        columns=tyr.lineage.core.ColumnList(
            [
                tyr.lineage.macros.columns.staging_column_transform(
                    tyr.lineage.tables.Select(car_location).columns.list_columns_()[0],
                    car_location.expected_column_metadata["X"],
                )
            ]
        ),
    )


source_table = build_x_metadata_table(source, car_location_subquery)

print("Query with 1dp precision")
print(source_table.sql)

try:
    # Let's update the precision of X from 1dp to 0dp
    source.tables.car_location.expected_column_metadata["X"].precision = "0dp"

    # Any object downstream relying on this information must be reinitialized
    source_table = build_x_metadata_table(source, car_location_subquery)

    print("\n")
    print("Query with 0dp precision")
    print(source_table.sql)
finally:
    # Note that re-running this cell without the following line does not reset the column precision to 1dp
    # This is because the column metadata has not been re-read from file.
    # The reset lives in a finally clause so it also happens when the rebuild or print above fails.
    source.tables.car_location.expected_column_metadata["X"].precision = "1dp"

KeyError: 'name'

Experiment with changing parameters in the configuration files:
- <code>tyr/tests/configurations/column_metadata.tsv</code>
- <code>tyr/tests/configurations/file_metadata.tsv</code>

A full list of parameters can be queried in the ColumnMetadata object docs



In [None]:
# Print the docstring of the ColumnMetadata class
column_metadata_class = tyr.lineage.schema.source.ColumnMetadata
print(column_metadata_class.__doc__)

________________________________________________________________________________________________________________
# Building Source & Staging

<p>Now that we have our metadata objects read in, we can construct source and staging. It is important to note that, although source takes <code>expected_column_metadata</code> as an argument, this isn't used in the construction of the sql query. It is stored within the source table object and used downstream by staging. The <code>SourceSettings</code> and <code>StagingSettings</code> objects are required for their respective schemas to read in the information correctly. The <code>extensions</code> parameter is used to install any necessary duckdb extensions. In this case, <code>spatial</code> will be required as geometry objects exist in the files.</p>

In [None]:
# Settings for the source schema: file metadata, expected column metadata and extensions
source_settings = tyr.lineage.schema.source.SourceSettings(
    file_metadata=file_metadata,
    expected_column_metadata=column_metadata,
    extensions=[{'name':"spatial", 'origin':'duckdb'}],
)
source = tyr.lineage.schema.source.Source(settings=source_settings)

# Staging is built on top of the source schema with its own settings object
staging_settings = tyr.lineage.schema.staging.StagingSettings(
    name="staging", extensions=[{'name':"spatial", 'origin':'duckdb'}]
)
staging = tyr.lineage.schema.staging.Staging(source=source, settings=staging_settings)

_________________________________________________________________________________________________________
# Saving schemas

We can save schemas to recall them in other files without re-reading metadata. 
Note that this will mean that changing the metadata files will not update the schema. 
Only recreating them from the files will do this.

This practice is useful for version controlling schemas.

In [None]:
# Note that after this operation, the saved schema will appear in the tests/saved_schema/ directory
saved_schema_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "tests/saved_schema"))

source.save(saved_schema_dir)
staging.save(saved_schema_dir)

# Reload both schemas from the pickle files in that directory
source = tyr.lineage.schema.core.load_schema_from_pkl(
    os.path.join(saved_schema_dir, "source.pkl")
)
staging = tyr.lineage.schema.core.load_schema_from_pkl(
    os.path.join(saved_schema_dir, "staging.pkl")
)

___________________________________________________________________________________________________________________________________________________
# Interacting with lineage objects
<p>Now we can interact with the table objects and retrieve their attributes as follows: </p>

In [None]:
# Every lookup below is made on the staging sessions table
sessions = staging.tables.sessions

# Retrieve name
print("\n")
print(f"Table name: {sessions.name}")

# Retrieve column names
print("\n")
print("Column Names:")
print(sessions.columns.list_names_())

# Columns can be accessed through attribute name or string indexing
print("\n")
print("Column from attribute name:")
print(sessions.columns.start_date.__dict__)

print("\n")
print("Column from string index:")
print(sessions.columns["start_date"].__dict__)

# Retrieve primary key
print("\n")
print("Primary Key:")
print(sessions.primary_key.list_names_())

# Retrieve query
print("\n")
print("SQL:")
print(sessions.sql)

# Retrieve schema of table
print("\n")
print("Schema name:")
print(sessions.schema.name)

# Retrieve name of source table with schema
print("\n")
print("Source table name:")
print(f"{sessions.source.schema.name}.{sessions.source.name}")

______________________________________________________________________________________________________________________________________________________
# Retrieving attributes to root
Attributes can be chained from one object to the next until the root object is reached.

In [None]:
# Schema object followed by its name, one per line
print(staging, staging.name, sep="\n")

In [None]:
# Schema tables: the table list object, then the names it holds
schema_tables = staging.tables
print(schema_tables)
print(schema_tables.list_names_())

In [None]:
schema_tables = staging.tables
# car_telemetry table, reached by attribute name
print(schema_tables.car_telemetry)
# TableList can also be accessed with string index
print(schema_tables["car_telemetry"].name)

In [None]:
# car_telemetry columns: the column list object, then the names it holds
telemetry_columns = staging.tables.car_telemetry.columns
print(telemetry_columns)
print(telemetry_columns.list_names_())

In [None]:
# kmh column in car_telemetry
kmh = staging.tables.car_telemetry.columns.kmh
print(kmh)
# kmh column name
print(kmh.name)
# check attributes of kmh
print(f"var_type: {kmh.var_type}")
print(f"data_type: {kmh.data_type.value}")
print(f"unit: {kmh.unit.name}")
print(f"sql: {kmh.sql}")

In [None]:
# kmh column source in car_telemetry - math ROUND function
kmh_source = staging.tables.car_telemetry.columns.kmh.source
print(kmh_source)
# At any point we can query sql. This will display sql down to the root level
print(kmh_source.sql)

In [None]:
# math ROUND function arg[0] - data_type TRYCAST function
round_first_arg = staging.tables.car_telemetry.columns.kmh.source.args[0]
print(round_first_arg)
print(round_first_arg.sql)

In [None]:
# data_type TRYCAST function arg[0] - expression AS
trycast_first_arg = staging.tables.car_telemetry.columns.kmh.source.args[0].args[0]
print(trycast_first_arg)
print(trycast_first_arg.sql)

In [None]:
# expression AS left object - utility function SourceWildToStaging <- This initializes the lineage tree for column objects
as_left_operand = staging.tables.car_telemetry.columns.kmh.source.args[0].args[0].left
print(as_left_operand)
# We can check which attributes are available to query using the .__dict__ attribute
pprint(as_left_operand.__dict__)

In [None]:
# Get WildCard column from SourceWildToStaging
wildcard_column = (
    staging.tables.car_telemetry.columns.kmh.source.args[0].args[0].left.args[0]
)
print(wildcard_column)

# We can check the current table of this object to determine the source table
wildcard_table = wildcard_column.current_table
print("Source table: " + wildcard_table.schema.name + "." + wildcard_table.name)

In [None]:
# The WildCard value is the root object and therefore has no source
wildcard_source = (
    staging.tables.car_telemetry.columns.kmh.source.args[0].args[0].left.args[0].source
)
print(wildcard_source.__dict__)