# Project Schema

The project schema draws from staging/datalake tables to perform analysis, create dashboards, etc.
It is designed for custom tables, and the process for interacting with it is slightly different from that of the staging and source schemas.

In [1]:
import pandas as pd
from units.core import Unit

import tyr

# Required to get relative path of test datasets
import os

from pprint import pprint

# Lift pandas' display limits so frames are rendered with every row and column.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

In [2]:
# Restore the previously saved source and staging schemas from the .pkl files
# stored under tests/saved_schema, one directory above the working directory.
# NOTE(review): presumably these are pickle files -- only load ones you trust.
parent_dir = os.path.join(os.getcwd(), "..")

source_pkl_path = os.path.abspath(
    os.path.join(parent_dir, "tests/saved_schema/source.pkl")
)
source = tyr.lineage.schema.core.load_schema_from_pkl(source_pkl_path)

staging_pkl_path = os.path.abspath(
    os.path.join(parent_dir, "tests/saved_schema/staging.pkl")
)
staging = tyr.lineage.schema.core.load_schema_from_pkl(staging_pkl_path)

In [3]:
# Connection to the test DuckDB file (tests/test.duckdb, one directory above the
# working directory), configured with read_only=False.
test_db_path = os.path.abspath(os.path.join(os.getcwd(), "..", "tests/test.duckdb"))

conn = tyr.database.connections.Connection(
    name="test",
    syntax="duckdb",
    database=test_db_path,
    read_only=False,
)

# Understanding the Project Schema

The project schema does not take tables as an attribute at initialization. Instead, tables must be added after it has been initialized.
This is because tables within the project schema can depend on other tables in the project schema. To ensure lineage is correctly defined,
a table must already exist in the project schema before it is used as the source of a different table.

In [4]:
# Show the names of the columns that form the primary key of the staging car_telemetry table.
car_telemetry_key = staging.tables.car_telemetry.primary_key
car_telemetry_key.list_names_()

['event_ts', 'driver_number']

In [5]:
# Create a new, initially empty project called "singapore"; its tables are added below.
singapore_settings = tyr.lineage.schema.project.ProjectSettings(
    name="singapore",
)
singapore = tyr.lineage.schema.project.Project(singapore_settings)


# Every staging table that has an event time is passed through the event time
# interval transform that lineage.macros already provides; all other staging
# tables are carried over with all of their columns.

# Interval the event times are transformed to.
# 500 milliseconds should be sufficient for now.
interval = tyr.lineage.values.Interval(500, Unit("ms^1"))

for table in staging.tables.list_tables_():
    if not table.event_time:
        # No event time: select every column and inherit the key settings from staging.
        passthrough_table = tyr.lineage.tables.Core(
            source=table,
            name=table.name,
            columns=tyr.lineage.macros.columns.select_all(table),
            inherit_primary_key=True,
            inherit_event_time=True,
        )
        singapore.add_table(passthrough_table, override=True)
        continue

    # Report which table is being transformed, its static primary key and its event time column.
    print(table.name)
    print(table.static_primary_key.list_names_())
    print(table.event_time.name)

    transformed_table = tyr.lineage.macros.tables.event_time_interval_transform(
        name=table.name,
        source=tyr.lineage.tables.Select(table),
        interval=interval,
    )
    singapore.add_table(transformed_table, override=True)

car_location
['driver_number']
event_ts
car_telemetry
['driver_number']
event_ts
race_control
['category']
event_ts
session_status
['session_key']
session_time
track_status
['status']
event_ts


In [8]:
# Run the SQL generated for the project's car_location table, capped at 1000 rows,
# and display the result as a DataFrame.
car_location_preview_sql = singapore.tables.car_location.sql + " LIMIT 1000"
conn.execute(car_location_preview_sql).df()

Unnamed: 0,driver_number,event_ts,status,x,y,z,source,meeting_time,session_time
0,4,2023-09-17 11:01:11.000,OnTrack,0.0,0.0,0.0,pos,0 days 00:00:10.974000,0 days 00:00:10.974000
1,4,2023-09-17 11:01:19.500,OnTrack,0.0,0.0,0.0,pos,0 days 00:00:19.253000,0 days 00:00:19.253000
2,4,2023-09-17 11:02:32.500,OnTrack,0.0,0.0,0.0,pos,0 days 00:01:32.373000,0 days 00:01:32.373000
3,4,2023-09-17 11:03:10.000,OnTrack,0.0,0.0,0.0,pos,0 days 00:02:09.894000,0 days 00:02:09.894000
4,4,2023-09-17 11:03:30.500,OnTrack,0.0,0.0,0.0,pos,0 days 00:02:30.294000,0 days 00:02:30.294000
5,4,2023-09-17 11:04:42.000,OnTrack,0.0,0.0,0.0,pos,0 days 00:03:41.954000,0 days 00:03:41.954000
6,4,2023-09-17 11:05:16.000,OnTrack,0.0,0.0,0.0,pos,0 days 00:04:15.834000,0 days 00:04:15.834000
7,4,2023-09-17 11:05:42.000,OnTrack,0.0,0.0,0.0,pos,0 days 00:04:41.934000,0 days 00:04:41.934000
8,4,2023-09-17 11:06:18.500,OnTrack,0.0,0.0,0.0,pos,0 days 00:05:18.274000,0 days 00:05:18.274000
9,4,2023-09-17 11:06:22.500,OnTrack,0.0,0.0,0.0,pos,0 days 00:05:22.413000,0 days 00:05:22.413000
