In [1]:
import pandas as pd

import tyr

# Required to get relative path of test datasets
import os

from pprint import pprint

# Lift the row limit so DataFrames shown in this notebook are rendered in full
pd.options.display.max_rows = None

In [4]:
# Load the previously saved source and staging schemas from their pickle files.
# Both paths are resolved from the current working directory, so the kernel must be
# started from a folder whose parent directory contains tests/saved_schema.
# NOTE(review): the loader reads .pkl files - only point it at trusted files such as
# the fixtures shipped with this repository.
saved_schema_dir = os.path.abspath(
    os.path.join(os.getcwd(), "..", "tests", "saved_schema")
)

# Deriving both file paths from the same directory keeps them in sync; previously the
# staging path used "..." instead of ".." and therefore pointed at a non-existent folder.
source = tyr.lineage.schema.core.load_schema_from_pkl(
    os.path.join(saved_schema_dir, "source.pkl")
)
staging = tyr.lineage.schema.core.load_schema_from_pkl(
    os.path.join(saved_schema_dir, "staging.pkl")
)

FileNotFoundError: [Errno 2] No such file or directory: '/home/miles/tyr/Tutorials/.ipynb_checkpoints/tests/saved_schema/source.pkl'

In [None]:
# Point a writable DuckDB connection at the test database file.
# The file location is resolved relative to the current working directory.
test_database_path = os.path.abspath(
    os.path.join(os.getcwd(), "..", "tests/test.duckdb")
)

conn = tyr.database.connections.Connection(
    name="test",
    syntax="duckdb",
    database=test_database_path,
    read_only=False,
)

In [None]:
# The tables created in 1.2 should still be in the database - list them to make sure
existing_tables = conn.tables()
display(existing_tables)

In [None]:
# Inspect the column metadata expected for the source car_telemetry table
car_telemetry_source_table = source.tables.car_telemetry
car_telemetry_source_table.expected_column_metadata

In [None]:
# Let's work through an example: build a new table, car_telemetry_analysis, on top of
# the car_telemetry table. It will hold the average speed in kmh for each gear and for
# each available driver.

# Best practice: keep the Select of the source table in its own variable. Most of the
# attributes below refer to it, and it makes the Select object easy to re-use.
car_telemetry_analysis_source = tyr.lineage.tables.Select(staging.tables.car_telemetry)

# Output columns.
# The average is taken over the static primary key of car_telemetry, which a macro
# retrieves for us. n_gear is not part of that key, so it is added explicitly, followed
# by the average_kmh metric built with the aggregate.Average function.
analysis_columns = tyr.lineage.macros.columns.select_static_primary_key(
    car_telemetry_analysis_source
) + tyr.lineage.core.ColumnList(
    [
        tyr.lineage.columns.Core(
            name=car_telemetry_analysis_source.columns.n_gear.name,
            source=tyr.lineage.columns.Select(
                car_telemetry_analysis_source.columns.n_gear
            ),
        ),
        tyr.lineage.columns.Core(
            name="average_kmh",
            source=tyr.lineage.functions.aggregate.Average(
                car_telemetry_analysis_source.columns.kmh
            ),
        ),
    ]
)

# Primary key.
# Same static primary key columns as above plus n_gear, since the result is partitioned
# over n_gear as well. These columns are built again rather than re-used from the
# column list.
analysis_primary_key = tyr.lineage.macros.columns.select_static_primary_key(
    car_telemetry_analysis_source
) + tyr.lineage.core.ColumnList(
    [
        tyr.lineage.columns.Core(
            name=car_telemetry_analysis_source.columns.n_gear.name,
            source=tyr.lineage.columns.Select(
                car_telemetry_analysis_source.columns.n_gear
            ),
        ),
    ]
)

# Assemble the core table. Best practice is to give it the same name as the variable.
# group_by=True tells the table to group by its primary key.
car_telemetry_analysis = tyr.lineage.tables.Core(
    name="car_telemetry_analysis",
    source=car_telemetry_analysis_source,
    columns=analysis_columns,
    primary_key=analysis_primary_key,
    group_by=True,
)

# Run the generated query and eyeball the output to check that it looks sensible
car_telemetry_analysis_df = conn.execute(car_telemetry_analysis.sql).df()
display(car_telemetry_analysis_df)

In [None]:
# The metrics above don't tell us who the drivers actually are.
# The results table carries more information about them, so take a look at it
results_df = conn.execute(staging.tables.results.sql).df()
display(results_df)

In [None]:
# Combine our metrics with the results table through a Join object.
# The metrics table is supplied via the ctes parameter of the table Core object.

left = tyr.lineage.tables.Select(car_telemetry_analysis)
right = tyr.lineage.tables.Select(staging.tables.results)

# Left join the metrics onto the results, matching rows on driver_number
metrics_left_join_results = tyr.lineage.expressions.LeftJoin(left=left, right=right)
driver_number_matches = tyr.lineage.core.Condition(
    checks=[
        tyr.lineage.expressions.Equal(
            tyr.lineage.columns.Select(left.columns.driver_number),
            tyr.lineage.columns.Select(right.columns.driver_number),
        )
    ]
)
analysis_results_join = tyr.lineage.joins.Join(
    join_expression=metrics_left_join_results,
    condition=driver_number_matches,
)

# The metrics table is referenced by the join, so it is registered as a CTE
analysis_ctes = tyr.lineage.core.TableList([car_telemetry_analysis])

# Take the descriptive columns from the right-hand (results) side, followed by every
# column of the left-hand (metrics) side.
# NOTE(review): elsewhere in this notebook column sources are wrapped in
# tyr.lineage.columns.Select - confirm that passing the column directly is intended here.
driver_detail_columns = tyr.lineage.core.ColumnList(
    [
        tyr.lineage.columns.Core(
            name=column_name,
            source=analysis_results_join.join_expression.right.columns[column_name],
        )
        for column_name in ["team_name", "broadcast_name"]
    ]
)
joined_columns = driver_detail_columns + tyr.lineage.macros.columns.select_all(
    analysis_results_join.join_expression.left
)

joined_analysis_results = tyr.lineage.tables.Core(
    name="joined_analysis_results",
    ctes=analysis_ctes,
    source=analysis_results_join,
    columns=joined_columns,
)

joined_analysis_results_df = conn.execute(joined_analysis_results.sql).df()
display(joined_analysis_results_df)

In [None]:
# The same result can be reached in more than one way. This variant uses a Subquery
# for the metrics table instead of a CTE.

left = tyr.lineage.tables.Subquery(car_telemetry_analysis)
right = tyr.lineage.tables.Select(staging.tables.results)

# Left join the metrics subquery onto the results, matching rows on driver_number
subquery_left_join_results = tyr.lineage.expressions.LeftJoin(left=left, right=right)
driver_number_matches = tyr.lineage.core.Condition(
    checks=[
        tyr.lineage.expressions.Equal(
            tyr.lineage.columns.Select(left.columns.driver_number),
            tyr.lineage.columns.Select(right.columns.driver_number),
        )
    ]
)
analysis_results_join = tyr.lineage.joins.Join(
    join_expression=subquery_left_join_results,
    condition=driver_number_matches,
)

# Intermediate table exposing every column of the join
all_join_columns = tyr.lineage.macros.columns.select_all(analysis_results_join)

# Its primary key is restricted with a regex: the negative lookahead rejects any column
# whose name begins with one of the left-hand side's column names
left_column_names = "|".join(
    analysis_results_join.join_expression.left.columns.list_names_()
)
join_primary_key = tyr.lineage.macros.columns.select_all(
    analysis_results_join,
    filter_regex=rf"^(?!{left_column_names}).*",
)

joined_analysis_results_source = tyr.lineage.tables.Core(
    name="joined_analysis_results_source",
    source=analysis_results_join,
    columns=all_join_columns,
    primary_key=join_primary_key,
)

# Wrap the intermediate table in a subquery so the final table can select from it
joined_analysis_results_source_subquery = tyr.lineage.tables.Subquery(
    source=joined_analysis_results_source,
)

# Keep only the columns of interest for the final table
final_column_names = [
    "session_key",
    "broadcast_name",
    "driver_number",
    "n_gear",
    "average_kmh",
]
final_columns = tyr.lineage.core.ColumnList(
    tyr.lineage.macros.columns.select_all(joined_analysis_results_source_subquery)[
        final_column_names
    ]
)

joined_analysis_results = tyr.lineage.tables.Core(
    name="joined_analysis_results",
    source=joined_analysis_results_source_subquery,
    columns=final_columns,
    inherit_primary_key=True,
)

# Show the generated SQL first, then the rows it returns
print(joined_analysis_results.sql)
joined_analysis_results_df = conn.execute(joined_analysis_results.sql).df()
display(joined_analysis_results_df)

In [None]:
# Close the database connection now that the examples are finished
conn.close()