## Ingest SEC DERA data into Trino pipeline

Copyright (C) 2024 OS-Climate

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Contributed by Michael Tiemann (Github: MichaelTiemannOSC)

Run these in a notebook cell if you need to install onto your nb env

```python
%%capture pipoutput
# The 'capture' cell magic (it must be the first line of the cell) prevents long outputs from spamming your notebook

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
```

In [None]:
import os
import pathlib
from pathlib import Path
import shutil
from sqlalchemy import text
import io
import zipfile
import datetime

import boto3

import osc_ingest_trino as osc

import pandas as pd
import re

In [None]:
# Run the shell `date` command so the run records the current date/time
os.system('date')

In [None]:
# From the AWS Account page, copy the export scripts from the appropriate role using the "Command Line or Programmatic Access" link
# Paste the copied text into ~/credentials.env

# Load environment variables from credentials.env
# (presumably these include the TRINO_USER and S3_LANDING_* values read via os.environ below -- confirm)
osc.load_credentials_dotenv()

Open a Trino connection using JWT for authentication

In [None]:
# Target catalog/schema for the ingested tables, and the prefix given to each DERA table name
iceberg_catalog = 'osc_datacommons_dev'
iceberg_schema = 'dera'
dera_table_prefix = 'dera_'

# The engine is bound to the catalog and schema above; `cxn` is a single open connection
# used for the ad-hoc query in the next cell
engine = osc.attach_trino_engine(verbose=True, catalog=iceberg_catalog, schema=iceberg_schema)
cxn = engine.connect()

In [None]:
# Sanity check of the Trino connection: list every schema visible in the catalog
schema_read = cxn.execute(text(f'show schemas in {iceberg_catalog}'))
if schema_read.returns_rows:
    schema_rows = schema_read.fetchall()
    for schema_row in schema_rows:
        print(schema_row)

In [None]:
# bucket must be configured with credentials for trino, and accessible to the hive catalog
# You may need to use a different prefix here depending on how you name your credentials.env variables
# (hive_bucket is later handed to osc.fast_pandas_ingest_via_hive as the staging bucket)
hive_bucket = osc.attach_s3_bucket('S3_HIVE')

In [None]:
# create demo table named per user
# avoids problems with users reusing table names and associated permission problems
# NOTE(review): `uniq` is only displayed here; no visible cell uses it to name a table -- confirm it is still needed
uniq = os.environ['TRINO_USER']
uniq

In [None]:
# Hive catalog/schema passed to osc.fast_pandas_ingest_via_hive alongside the Iceberg target
hive_catalog = 'osc_datacommons_hive_ingest'
hive_schema = 'ingest'

In [None]:
# Create the target schema if it is missing.  The catalog comes from `iceberg_catalog`
# (rather than a second hard-coded copy of its value) so that changing the catalog in
# one place keeps the engine, the schema and the generated dbt models in agreement.
# NOTE(review): the S3 bucket in `location` is still hard-coded for the dev environment.
schema_create = osc._do_sql(f"""
create schema if not exists {iceberg_catalog}.{iceberg_schema}
WITH (
    location = 's3a://osc-datacommons-s3-bucket-dev02/data/{iceberg_schema}.db/'
)
""", engine, verbose=True)

In [None]:
# Connection settings for the landing bucket that holds the raw SEC DERA downloads
landing_settings = dict(
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)
s3_source = boto3.resource(service_name="s3", **landing_settings)
source_bucket = s3_source.Bucket(os.environ['S3_LANDING_BUCKET'])

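List the tables currently in the schema (the previous tables and their dependent views are dropped further below, just before they are reloaded, so that we start with a fresh slate)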

In [None]:
# Report which tables already exist in the target schema
qres = osc._do_sql(f"show tables in {iceberg_schema}", engine, verbose=True)
print(qres)

Initialize the DBT dictionary that we will write out as YML at the end

In [None]:
# Model metadata collected during ingestion; written out as dera_schema.yml at the end
dbt_dict = {'models': {}}

# The dbt project sits next to the directory the notebook runs from
repo_root = Path().resolve().parent
models_dir = repo_root / "dbt" / "dera_transform" / "models"

# Start from an empty models directory: remove whatever is there, then recreate it
shutil.rmtree(models_dir, ignore_errors=True)
models_dir.mkdir(mode=0o755)

Load `ticker` file (updated sporadically from https://www.sec.gov/include/ticker.txt)

In [None]:
# Reuse a previously downloaded copy of the ticker file when one is already in /tmp
ticker_cache = "/tmp/dera-ticker.txt"
if not os.path.isfile(ticker_cache):
    ticker_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'SEC-DERA/ticker.txt')
    ticker_file.download_file(ticker_cache)

# The file has no header row: tab-separated ticker name and CIK
ticker_df = pd.read_csv(
    ticker_cache,
    sep='\t',
    header=None,
    names=['tname', 'cik'],
    dtype={'tname':'string','cik':'int64'},
    engine='c',
)
# CIK -> ticker name lookup
ticker_dict = dict(zip(ticker_df.cik, ticker_df.tname))

ticker_df

In [None]:
# The generated SQL is source code for the dbt environment, so catalog, schema and table
# are all spelled out in full
ticker_table = 'ticker'
ticker_model_path = models_dir.joinpath(f"{ticker_table}.sql")

with open(ticker_model_path, "w", encoding="utf-8") as f:
    print(f"writing to model file {ticker_table}.sql in directory {models_dir}")
    ticker_model_sql = "{{ config(materialized='view', view_security='invoker') }}" + f"""
with source_data as (
    select {', '.join(ticker_df.columns)}
    from {iceberg_catalog}.{iceberg_schema}.{ticker_table}_source
)
select * from source_data
"""
    f.write(ticker_model_sql + "\n")

The following text describes DBT model properties

The following text describes DBT external properties

### FIXME

Due to the fact that we need to drop and reload sub, num, and tag just to get the tags right,
we drop the views here that depend on those tables, recreating them once the tables are stable.

In [None]:
# FIXME: right now incremental logic is essentially manual.
# Would be nice to cleanly update data on quarterly basis without monkey-patching here

# False: drop the dependent views and the DERA source tables, then reload every quarter found in the bucket.
# True: keep existing tables and load only the quarter hard-coded in the ingest loop below.
incremental = False

In [None]:
# Views built on top of the sub/num/tag tables.  On a full reload they are dropped
# before the tables themselves are replaced.
dependent_views = (
    'assets_by_lei', 'assets_usd_by_lei', 'assets_xyz_by_lei',
    'cash_by_adsh_ddate', 'cash_by_lei', 'cash_usd_by_lei', 'cash_xyz_by_lei',
    'debt_by_adsh_ddate', 'debt_by_lei', 'debt_usd_by_lei', 'debt_xyz_by_lei',
    'financials_by_lei',
    'float_by_lei', 'float_usd_by_lei', 'float_xyz_by_lei',
    'fy_revenue_by_lei', 'fy_revenue_usd_by_lei', 'fy_revenue_xyz_by_lei',
    'fy_income_by_lei', 'fy_income_usd_by_lei', 'fy_income_xyz_by_lei',
)

if not incremental:
    for view in dependent_views:
        sql = f"drop view if exists {view}"
        qres = osc._do_sql(sql, engine, verbose=True)

In [None]:
# Rebuild the ticker source table from scratch and bulk-insert the ticker DataFrame
ticker_source = f"{ticker_table}_source"
columnschema = osc.create_table_schema_pairs(ticker_df)

qres = osc._do_sql(f"drop table if exists {ticker_source}", engine, verbose=True)

tabledef = f"""
create table if not exists {ticker_source}(
{columnschema}
) with (
partitioning = array['bucket(tname,10)'],
format = 'ORC'
)
"""
qres = osc._do_sql(tabledef, engine, verbose=True)

ticker_inserter = osc.TrinoBatchInsert(batch_size = 15000, verbose = True)
ticker_df.to_sql(
    ticker_source,
    con=engine,
    schema=iceberg_schema,
    if_exists='append',
    index=False,
    method=ticker_inserter,
)

Prepare GLEIF matching data

In [None]:
# Download the precomputed DERA-name -> LEI matches and load them as a lookup table
gleif_local = '/tmp/dera-gleif.csv'
gleif_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],'mtiemann-GLEIF/DERA-matches.csv')
gleif_file.download_file(gleif_local)
gleif_df = pd.read_csv(gleif_local, header=0, sep=',', dtype=str, engine='c')
gleif_dict = dict(zip(gleif_df.name, gleif_df.LEI))

# Hand-maintained additions/overrides found after the match file was produced
gleif_dict.update({
    # Manual fixups discovered since processing
    'GROUP SIMEC SA DE CV': '529900LCYCXPA0TZEU09',
    'ENEL GENERACION CHILE S.A.': '549300PVHXUFEIE6LY50',
    'POSCO HOLDINGS INC.': '988400E5HRVX81AYLM04',
    'ARCHAEA ENERGY INC.': '549300ZBE567NNMH7V89',
    'CLEANSPARK, INC.': '254900VO7KBRJQDGY810',
    'ALGOMA STEEL GROUP INC.': '549300Q5EU337A1XCX27',
    'ECO WAVE POWER GLOBAL AB (PUBL)': '5493003GP1XAFTYRJM76',
    'CPFL ENERGY INC': '529900GBWSBDXN8GGM28',
    'PAMPA ENERGY INC.': '254900QNIK0CVURGML24',
    'ENERGY CO OF MINAS GERAIS': '254900W703PXLDSEM056',
    'BRAZILIAN ELECTRIC POWER CO': '254900I8KYDELP4B4Z08',
    # And in the steel portfolio
    'FRIEDMAN INDUSTRIES INC': '549300VI5ADYNC8C3G47',
})

Helper functions to load the SUB, NUM, and TAG tables into Trino

In [None]:
# import re
# NOTE(review): uuid is not referenced in the visible cells -- confirm before removing
import uuid

# Borrowed/stole this definition from SEC Corp Financials notebook...
# A filing that reports any of these tags is treated as already having a float value,
# so no float is inferred for it (see read_dera_table).
float_tags = [
    'EntityPublicFloat',
    'StockholdersEquityIncludingPortionAttributableToNoncontrollingInterest',
    'FreeFloat',
    'PublicFloat',
    'PublicFloatValue',
]

# These are in priority preference order
# (infer_float uses the first tag in this list that the filing reports)
share_tags = [
    'EntityCommonStockSharesOutstanding',
    'CommonStockSharesOutstanding',
    'SharesOutstanding',
    'WeightedAverageNumberOfDilutedSharesOutstanding',
    'WeightedAverageNumberOfSharesOutstandingBasic',
]

# We don't want anything that reports par value, such as CommonStockValue
shareprice_tags = [
    'SharePrice',
    'PerSharePrice',
    'MarketValuePerShare',
    'SaleOfStockPricePerShare',
    'CashPricePerOrdinaryShare',
    'TreasurySharesValuePerShare',
    'SharesOutstandingPricePerShare',
]

# Treasury share counts and treasury share valuations: infer_float divides a value by a
# share count to derive a price per share when no share-price tag is reported
treasury_share_tags = [
    'TreasuryStockShares',
    'TreasuryStockShares1',
]

treasury_value_tags = [
    'TreasurySharesMarketValue',
    'FairValueOfTreasuryShares',
    'MarketValueOfTreasuryShares',
]

# Every tag infer_float can use as an input; read_dera_table filters NUM rows to these
all_float_helper_tags = share_tags + shareprice_tags + treasury_share_tags + treasury_value_tags

# Matches an optional space followed by '/' and everything after it;
# read_dera_table uses it to strip that suffix from the SUB `name` column
dera_regex = re.compile(r' ?/.*$')
# Most recently read DataFrame for each DERA table, keyed by table name ('tag', 'sub', 'num')
dera_df = {}

# floor() is used by generate_intermediate_ddate to pick the year between two report dates
from math import floor

def generate_intermediate_ddate(df_orig):
    """Synthesize a single year-end fact from the two most recent rows of DF_ORIG.

    DF_ORIG is expected to be sorted by `ddate` in descending order (callers sort it
    that way) and to carry the columns `ddate` (timezone-aware UTC timestamps),
    `value`, `adsh`, `tag` and `version`.

    * With exactly one row, DF_ORIG is returned unchanged.
    * Otherwise a one-row copy of the first row is returned whose `ddate` is
      December 31 (UTC) of the year midway between the two rows' years (rounded down),
      whose `version` is the first row's `adsh`, and whose `value` is
        - the plain average of the two values when both dates fall in the same year, or
        - the value linearly interpolated between the two dates at that year end.

    Only the first two rows are used; a diagnostic is printed when they are more than
    731 days apart or fall in the same year.
    """
    if len(df_orig)==1:
        return df_orig
    latest = df_orig.iloc[0]
    earlier = df_orig.iloc[1]
    year1 = latest['ddate']
    year2 = earlier['ddate']
    if (year1-year2).days > 731:
        print("gap years")
        print(df_orig.iloc[0:2])
    new_df = df_orig.iloc[[0]].copy()
    # Integer floor division gives the same year as floor((y1+y2)/2.0)
    year_end = pd.to_datetime(f"{(year1.year+year2.year)//2}1231", format='%Y%m%d', utc=True)
    new_df['ddate'] = year_end
    if year1.year==year2.year:
        print("same years")
        print(df_orig.iloc[0:2])
        new_df['value'] = (latest['value'] + earlier['value'])/2.0
    else:
        # Linear interpolation: each value is weighted by the distance from year_end to the
        # *other* date, and the weights are normalized by the actual span so they sum to 1.
        # (Previously both weights used the distance to the earlier date and a fixed 365-day
        # divisor, so two identical inputs could produce a different output.)
        span_days = (year1-year2).days
        weight_latest = (year_end-year2).days
        weight_earlier = (year1-year_end).days
        new_df['value'] = (weight_latest*latest['value'] + weight_earlier*earlier['value'])/span_days
    # The synthesized fact records the filing it was derived from in `version`
    new_df['version'] = latest['adsh']
    print(f"adding fact ({new_df.tag})")
    return new_df

# When this function is called, we already know that we have no matches in FLOAT_TAGS.
# GROUPED_DF is grouped by ADSH and only for annual reports.  DDATE can be anything (because many reports look back 1-5 years)
# We are working these annual reports quarter by quarter for the quarter in which they are reported

def infer_float(grouped_df):
    """Compute a float value (price per share * shares outstanding) for each group of GROUPED_DF.

    GROUPED_DF is a pandas GroupBy (the caller groups NUM rows by `adsh`); each group's
    rows are expected to carry the columns adsh, tag, ddate, coreg, uom, value, fy and fp.

    For every group:
      * shares outstanding come from the first tag in `share_tags` that the group reports
        (groups with no positive share count are skipped);
      * the price per share comes from a `shareprice_tags` row ('ComputedMarketFloat'), or,
        when no such row exists, from treasury value / treasury shares
        ('ComputedTreasuryFloat'); groups for which neither can be derived are skipped.

    Returns a DataFrame with one synthesized row per successful group (tag set to one of the
    two Computed* names, qtrs=0, srcdir='computed', version='osc_dera_ingest', footnote=NA),
    cast to the group's dtypes minus `fy`/`fp`; an empty DataFrame when nothing was inferred.

    NOTE(review): the failure path calls `display`, which is presumably the IPython/Jupyter
    builtin -- this function will raise NameError there if run outside a notebook.
    """
    new_df = pd.DataFrame()
    for key, item in grouped_df:
        df = grouped_df.get_group(key)
        # We have no overall float value.  Build from shares outstanding * price
        df_shares = df[df.tag.isin(share_tags) & df.value.gt(0)]
        if df_shares.empty:
            continue
        else:
            # Keep only the rows of the highest-priority share tag that is present
            for share_tag in share_tags:
                if not df_shares[df_shares.tag==share_tag].empty:
                    df_shares = df_shares[df_shares.tag==share_tag]
                    break
            # When several rows are dated within the fiscal year, keep just those, newest first
            if len(df_shares[df_shares.ddate.dt.year==df_shares.fy.dt.year])>1:
                print("thinning shares")
                df_shares = df_shares[df_shares.ddate.dt.year==df_shares.fy.dt.year]
                df_shares = df_shares.sort_values('ddate', ascending=False)
        df_prices = df[df.tag.isin(shareprice_tags)]
        # We have no overall price.  Build from price derived from treasury valuation
        if df_prices.empty:
            df_treasury_shares = df[df.tag.isin(treasury_share_tags)]
            df_treasury_value = df[df.tag.isin(treasury_value_tags)]
            if df_treasury_shares.empty or df_treasury_value.empty:
                continue
            # After the merge, value_x is the treasury valuation and value_y the treasury share count
            df_svp = df_treasury_value.merge(df_treasury_shares, on=['adsh', 'ddate', 'coreg'])
            if df_svp.empty:
                print(f"{df.adsh.iat[0]}: merge failed (1)")
                continue
            # Pick latest date / largest number of shares as basis
            df_float = df_svp.sort_values(['ddate', 'value_y'], ascending=False).iloc[[0]].copy()
            price_per_share = df_float.value_x.squeeze() / df_float.value_y.squeeze()
            df_float.rename(columns={'uom_x':'uom'},inplace=True)
            tag = 'ComputedTreasuryFloat'
        else:
            # if df_prices[df_prices.tag.str.startswith('ShareBasedCompensationArrangementByShareBasedPaymentAward')].empty:
            #     print(f"Must use market prices; len(df_prices) =  {len(df_prices)}")
            # else:
            #     print(f"Can use Share Based Comp {df_prices.tag.str[45:]}:\n{df_prices}\n\n")
            # We derive a price from market reports
            # After the merge, value_x is the price and value_y the share count
            df_svp = df_prices.merge(df_shares, on=['adsh', 'ddate', 'coreg'])
            if df_svp.empty:
                # No price and share count share a date: try to reconcile them
                if len (df_prices[df_prices.ddate.dt.year==df_prices.fy.dt.year])>1:
                    print("thinning prices")
                    df_prices = df_prices[df_prices.ddate.dt.year==df_prices.fy.dt.year]
                if len(df_prices)<3 and len(df_shares)<3:
                    # At most two rows each: collapse each side to a single (possibly interpolated) row
                    df_prices = generate_intermediate_ddate(df_prices.sort_values('ddate', ascending=False))
                    df_float = generate_intermediate_ddate(df_shares.sort_values('ddate', ascending=False))
                    price_per_share = df_prices.value.squeeze()
                    # print("merge rescued (2)")
                    # display(df_shares)
                else:
                    print(f"{df.adsh.iat[0]}: merge failed (2)")
                    print(f"len(df_prices) = {len(df_prices)}")
                    print(f"len(df_shares) = {len(df_shares)}")
                    display(df_prices)
                    display(df_shares)
                    continue
            else:
                # Pick latest date / largest number of shares as basis
                df_float = df_svp.sort_values(['ddate', 'value_y'], ascending=False).iloc[[0]]
                price_per_share = df_float.value_x.squeeze() # value_x is a price in this case
                df_float.rename(columns={'uom_x':'uom'},inplace=True)
            tag = 'ComputedMarketFloat'
        # Reduce the basis row to its identifying columns, then fill in the synthesized fact
        df_float = df_float[['adsh', 'ddate', 'uom', 'coreg']]
        df_float['tag'] = tag
        # TODO: should connect price ddate with total shares ddate
        # Share count is taken from the first remaining share row, not from the merged basis row
        total_shares = df_shares.iloc[0].value
        df_float['value'] = price_per_share * total_shares
        df_float['qtrs'] = 0
        df_float['srcdir'] = 'computed'
        df_float['version'] = 'osc_dera_ingest'
        df_float['footnote'] = pd.NA
        # Match the dtypes of the source rows (fy/fp are helper columns the caller drops)
        df_float = df_float.astype(df.drop(columns=['fy','fp']).dtypes.to_dict())
        new_df = pd.concat([new_df, df_float])
    return new_df

# Known typos in the DDATE column of specific quarterly NUM files, keyed by quarter:
# {bad value: corrected value}
_DERA_DDATE_TYPOS = {
    '2017q1': {'21051130': '20151130', '21061130': '20161130'},
    '2017q3': {'60160630': '20160630'},
    '2018q2': {'22011231': '20211231'},
    '2019q1': {'21080430': '20180430', '21081031': '20181031'},
    '2019q2': {'29171231': '20171231'},
    '2021q3': {'30210630': '20210630'},
    '2022q1': {'21211231': '20211231'},
    '2024q1': {'29230930': '20230930', '29231231': '20231231'},
}


def _dera_flag_to_bool(flags):
    """Convert a DERA flag column read as strings ('1' = true, '0' or '' = false) to bool.

    A plain ``astype('bool')`` must not be used here: it tests string truthiness, so the
    non-empty string '0' would become True.
    """
    return flags.eq('1').fillna(False).astype('bool')


def read_dera_table(zf, fy_qtr, tbl):
    """From a local file ZF, read data for the period FY_QTR for the DERA table TBL.
    Return the Dataframe created so that when it is time to create the actual Trino table
    we know what the shape of the data should look like.  The returned DF has all the data
    of the specific ingestion, not all the data of all the ingestions of data for TBL.

    ZF is a file-like object holding tab-separated text with a header row.  FY_QTR (for
    example '2024q1') is stored in the added `srcdir` column and selects the per-quarter
    data fix-ups.  TBL is one of 'sub', 'num' or 'tag'; any other value returns the raw
    string columns plus `srcdir`.

    For 'num' the global `dera_df['sub']` must already hold the SUB table of the same
    quarter (KeyError otherwise): only facts of annual reports (10-K, 20-F, 40-F) with
    fp == 'FY' are kept, and computed float facts are appended for filings that report
    none of `float_tags`.
    """
    df = pd.read_csv(zf, header=0, sep='\t', dtype='string', keep_default_na=False, nrows = None, engine='c')
    df['srcdir'] = fy_qtr
    df['srcdir'] = df.srcdir.astype('string')

    if tbl=='sub':
        # While the documentation of the DERA SUB table says that FY must be non-NULL, many forms don't specify a fiscal year
        # (forms 1, 2, 3, 4, 6, 8, 10-12, POS AM, POS EX, 425, etc).  And none of those forms relate to data we are trying to ingest,
        # so we drop them.
        # .copy() so the column assignments below operate on an independent frame, not a slice
        df = df[df.fy.ne('') & df.period.ne('')].copy()
        df['name'] = df.name.map(lambda x: dera_regex.sub('', x)).astype('string')
        df['LEI'] = df.name.map(gleif_dict).astype('string')
        df['cik'] = df.cik.astype('int32')
        df.loc[df.sic=='', 'sic'] = pd.NA
        df['sic'] = df.sic.astype('Int16')
        df.loc[df.ein=='', 'ein'] = pd.NA
        df['ein'] = df.ein.astype('Int64')
        for flag in ('wksi', 'prevrpt', 'detail'):
            df[flag] = _dera_flag_to_bool(df[flag])
        df['period'] = pd.to_datetime(df.period, format='%Y%m%d', utc=True, errors='coerce')
        df['fy'] = pd.to_datetime(df.fy, format='%Y', utc=True) # errors='coerce'
        df['filed'] = pd.to_datetime(df.filed, format='%Y%m%d', utc=True)
        df['accepted'] = pd.to_datetime(df.accepted, utc=True)  # format='%Y-%m-%d %H:%M:%S' but sometimes format='%Y-%m-%d %H:%M:%S.%f'
        df['nciks'] = df.nciks.astype('int16')

        cols = df.columns.tolist()
        # Move LEI (the last column, just added) to a more friendly location in the column order
        cols = cols[0:3] + [cols[-1]] + cols[3:-1]
        df = df[cols]
    elif tbl=='num':
        # documentation wrongly lists coreg as NUMERIC length 256.  It is ALPHANUMERIC.
        if fy_qtr=='2024q1':
            # Drop specific entries that are duplicated for both the dates 20231231 and 29231231
            # We drop the bad year, keeping the good.  Following, we change remaining bad years to good
            duplicated = (df.adsh.eq('0001654954-24-003228')
                          & df.ddate.eq('29231231')
                          & df.tag.isin(['OperatingLeaseLiabilityCurrent', 'OperatingLeaseRightOfUseAsset']))
            df = df[~duplicated].copy()
        for bad_ddate, good_ddate in _DERA_DDATE_TYPOS.get(fy_qtr, {}).items():
            df.loc[df.ddate==bad_ddate, 'ddate'] = good_ddate

        # Fix some bad AES data
        if fy_qtr=='2021q1':
            df.loc[(df.adsh=='0000874761-21-000015')&(df.tag=='CommonStockValue')&(df.ddate=='20190630'), 'ddate'] = '20200630'
        if fy_qtr=='2020q1':
            df.loc[(df.adsh=='0000874761-20-000012')&(df.tag=='EntityPublicFloat')&(df.ddate=='20180630'), 'ddate'] = '20190630'
            df.loc[(df.adsh=='0000874761-20-000012')&(df.tag=='CommonStockValue')&(df.ddate=='20180630'), 'ddate'] = '20190630'
        df['ddate'] = pd.to_datetime(df.ddate, format='%Y%m%d', utc=True)
        df['qtrs'] = df.qtrs.astype('int16')
        df.loc[df.coreg=='', 'coreg'] = pd.NA
        df.loc[df.value=='', 'value'] = pd.NA
        df['value'] = df.value.astype('Float64')
        df.loc[df.footnote=='', 'footnote'] = pd.NA

        print(f"Inferring floats: start len(df) = {len(df)}")
        # Attach fiscal year/period from the annual-report submissions of this quarter;
        # facts of any other filing get NA and are filtered out just below
        sub_df = dera_df['sub']
        annual_df = sub_df[sub_df.form.isin(['10-K','20-F','40-F'])]
        df['fy'] = df.adsh.map(dict(zip(annual_df.adsh,annual_df.fy)))
        df['fp'] = df.adsh.map(dict(zip(annual_df.adsh,annual_df.fp)))
        print(f"len(df[df.fp=='FY']) = {len(df[df.fp=='FY'])}")
        df = df[df.fp=='FY']
        df_has_float = df[df.tag.isin(float_tags)]
        print(f"len(df_has_float) = {len(df_has_float)}")
        df_needs_float = df[~df.adsh.isin(df_has_float.adsh)]
        print(f"len(df_needs_float) = {len(df_needs_float)}")
        # Only un-co-registered, positive-valued helper facts are usable inputs for inference
        float_inputs = df_needs_float[df_needs_float.coreg.isna()
                                      &df_needs_float.tag.isin(all_float_helper_tags)
                                      &(df_needs_float.value>0)]
        float_df = infer_float(float_inputs.groupby(['adsh'], as_index=False))
        df = df.drop(columns=['fy','fp'])
        if float_df.empty:
            print(f"{len(float_df)} floats inferred; Sorry!")
        else:
            float_df = float_df.astype(df.dtypes.to_dict())
            df = pd.concat([df, float_df])
            print(f"{len(float_df)} floats inferred; {len(float_df[float_df.tag=='ComputedTreasuryFloat'])} treasury-based; {len(float_df[float_df.tag=='ComputedMarketFloat'])} market-based")
    elif tbl=='tag':
        for flag in ('custom', 'abstract'):
            df[flag] = _dera_flag_to_bool(df[flag])
        df.loc[df.crdr=='', 'crdr'] = pd.NA
        df.loc[df.tlabel=='', 'tlabel'] = pd.NA
        df.loc[df.doc=='', 'doc'] = pd.NA

    return df

In [None]:
def append_to_trino_table_with_dbt_metadata(tablename, df, partition_columns=None, custom_meta_content='', custom_meta_fields='', verbose=False):
    """Append DF to the Trino table for TABLENAME and (re)write its dbt model file.

    The data goes to `{dera_table_prefix}{tablename}_source` in the Iceberg catalog/schema
    via osc.fast_pandas_ingest_via_hive (append, not overwrite).  A dbt view model named
    `{dera_table_prefix}{tablename}.sql` selecting DF's columns from that source table is
    written into `models_dir`.

    PARTITION_COLUMNS: list of column names to partition by (default: no partitioning).
    CUSTOM_META_CONTENT: optional mapping with a 'description' key; when given, the table
        is registered in `dbt_dict['models']`.
    CUSTOM_META_FIELDS: optional mapping of column name -> {'Description': ..., 'tags': ...}
        ('tags' optional); recorded as the model's columns.
    VERBOSE: passed to the ingest helper and enables the "writing to model file" message.

    Raises ValueError when CUSTOM_META_FIELDS is given without CUSTOM_META_CONTENT.
    """
    # Validate before touching Trino or the filesystem
    if custom_meta_fields and not custom_meta_content:
        raise ValueError("custom_meta_fields requires custom_meta_content (the table description)")
    # None instead of a shared mutable [] default
    if partition_columns is None:
        partition_columns = []

    iceberg_table = f'{dera_table_prefix}{tablename}'

    if custom_meta_content:
        dbt_table = {'description': custom_meta_content['description']}
        dbt_dict['models'][iceberg_table] = dbt_table
        if custom_meta_fields:
            dbt_columns = {}
            for name, field_meta in custom_meta_fields.items():
                dbt_columns[name] = {'description': field_meta['Description']}
                if 'tags' in field_meta:
                    dbt_columns[name]['tags'] = field_meta['tags']
            dbt_table['columns'] = dbt_columns

    osc.fast_pandas_ingest_via_hive(
        df,
        engine,
        iceberg_catalog, iceberg_schema, f"{iceberg_table}_source",
        hive_bucket, hive_catalog, hive_schema,
        partition_columns = partition_columns,
        overwrite = False,
        typemap={"datetime64[ns]":"timestamp(6)", "datetime64[ns, UTC]":"timestamp(6)",
                 "Int16":"integer", "int16":"integer"},
        verbose = verbose
    )

    # Fully spell out catalog, schema, and table because this is source code for the dbt environment
    with open(models_dir.joinpath(f"{iceberg_table}.sql"), "w", encoding="utf-8") as f:
        if verbose:
            print(f"writing to model file {iceberg_table}.sql in directory {models_dir}")
        print("{{ config(materialized='view', view_security='invoker') }}" + f"""
with source_data as (
    select {', '.join(df.columns)}
    from {iceberg_catalog}.{iceberg_schema}.{iceberg_table}_source
)
select * from source_data
""", file=f)

In [None]:
# Load order matters: reading 'num' uses the 'sub' table already read for the same quarter
dera_tablenames = ['tag', 'sub', 'num']

# A full (non-incremental) run starts from empty source tables
if not incremental:
    stale_tables = [f"{dera_table_prefix}{tbl}_source" for tbl in dera_tablenames]
    for stale_table in stale_tables:
        qres = osc._do_sql(f"drop table if exists {stale_table}", engine, verbose=True)

In [None]:
# Every landing-bucket object whose key starts with 'SEC-DERA/20'
# (the ingest loop below processes only the keys ending in '.zip')
objects=source_bucket.objects.filter(Prefix='SEC-DERA/20')

In [None]:
# Every distinct (tag, version) pair seen so far across all quarters, so that each
# quarter only appends the tags that are new to it
dera_tag = pd.DataFrame()
# Rows per INSERT batch for the slow (plain SQL) ingest path
batch_size = {'num':1500,'sub':1500,'tag':1000}
# Manual toggle: True ingests through the hive fast path, False uses batched SQL inserts
use_fast_hive_ingest = True

for obj in objects:
    if incremental:
        # Monkey-patch here if you want to incrementally load objects
        if obj.key != 'SEC-DERA/2024q1.zip':
            continue
        print(f"loading {obj.key}")
    if not obj.key.endswith('.zip'):
        continue

    zipfile_src = s3_source.Object(os.environ['S3_LANDING_BUCKET'],obj.key)
    tmpname = obj.key.split('/')[-1]
    zipfile_src.download_file(f'/tmp/{tmpname}')
    fy_qtr = tmpname.split('.')[0]
    # The context manager closes the archive even if reading one of the tables fails
    with zipfile.ZipFile(f'/tmp/{tmpname}', mode='r') as zipfile_obj:
        for tbl in dera_tablenames:
            print(f'{fy_qtr} - {tbl}')
            with zipfile_obj.open(f"{tbl}.txt") as zf:
                # Read data from ZF into a dataframe.
                dera_df[tbl] = read_dera_table (zf, fy_qtr, tbl)
            if tbl == 'tag':
                dera_tag = pd.concat([dera_tag, dera_df[tbl]]).drop_duplicates(subset=['tag','version'])

    if use_fast_hive_ingest:
        # Alas, there is some minor post-fixing we need to do before ingesting into parquet:
        # every computed treasury float gets an additional year-end-dated fact
        df = dera_df['num']
        num_fields = df.columns
        df = df[df.tag=='ComputedTreasuryFloat']
        treasury_df = dera_df['sub'].loc[dera_df['sub'].fp=='FY', ['adsh', 'cik', 'name','fye', 'fy', ]].merge(df, on='adsh')
        if len(treasury_df)>0:
            display(treasury_df)
            grouped_df = treasury_df.groupby('cik')
            for key, item in grouped_df:
                if len(item)==1:
                    # Single report for this company: re-date the fact to the fiscal year end
                    df = item[num_fields].copy()
                    df.ddate = pd.to_datetime(f"{item.fy.squeeze().year}1231", format='%Y%m%d', utc=True)
                    df.version = item.adsh.squeeze()
                    print("adding fact (1)")
                    print(df)
                else:
                    # Several reports: derive one year-end fact from the two most recent.
                    # (This used to call generate_intermediate_ddate_value, which is not defined.)
                    item = item.sort_values('ddate', ascending=False).reset_index(drop=True)
                    # Keep only the NUM columns so the SUB columns from the merge are not appended
                    df = generate_intermediate_ddate(item.iloc[0:2])[num_fields]
                df = df.astype(dera_df['num'].dtypes.to_dict())
                dera_df['num'] = pd.concat([dera_df['num'], df])
        for tbl in dera_tablenames:
            if tbl=='tag':
                # Only copy out the new tags in this zipfile.  Older data is already stored
                df = dera_tag[dera_tag.srcdir.eq(fy_qtr)]
                print(f"len(dera_tag) = {len(dera_tag)}")
                print(f"len(df) = {len(df)}")
            else:
                df = dera_df[tbl]
            # append_to_trino_table_with_dbt_metadata will prepend dera_table_prefix
            append_to_trino_table_with_dbt_metadata(f"{tbl}", df, partition_columns=['srcdir'], custom_meta_content='', custom_meta_fields='', verbose=False)
    else:
        # Import into Trino the slow way...
        for tbl in dera_tablenames:
            # Same `_source` table the fast path, the drop cell and the dbt models use
            ingest_table = f"{dera_table_prefix}{tbl}_source"
            columnschema = osc.create_table_schema_pairs(dera_df[tbl],
                                                         typemap={"int16":"integer", "Int16":"integer",
                                                                  "datetime64[ns, UTC]":"timestamp(6)"})
            tabledef = f"""
create table if not exists {ingest_table}(
{columnschema}
) with (
    partitioning = array['srcdir'],
    format = 'ORC'
)
"""
            print(tabledef)
            qres = osc._do_sql(tabledef, engine, verbose=False)
            if tbl=='tag':
                # Only copy out the new tags in this zipfile.  Older data is already stored
                df = dera_tag[dera_tag.srcdir.eq(fy_qtr)]
            else:
                df = dera_df[tbl]
            # schema was the undefined name `ingest_schema`; the tables live in iceberg_schema
            df.to_sql(ingest_table,
                      con=engine, schema=iceberg_schema, if_exists='append',
                      index=False,
                      method=osc.TrinoBatchInsert(batch_size=batch_size[tbl], verbose = False))

In [None]:
# Borrow metadata code from DERA-iceberg if/when we need it

In [None]:
# Report what was loaded: total ticker rows, then rows per source quarter for each DERA table
source_prefix = f"{iceberg_catalog}.{iceberg_schema}"

ticker_count = osc._do_sql(f"select count(*) from {source_prefix}.ticker_source", engine)
print("ticker_source count(*) = " + str(ticker_count[0][0]))

for tbl in dera_tablenames:
    per_quarter = osc._do_sql(f"select count (*),srcdir from {source_prefix}.{dera_table_prefix}{tbl}_source group by srcdir order by srcdir", engine)
    print(f"tbl = {tbl}")
    for n_rows, srcdir in per_quarter:
        print(f"{dera_table_prefix}{tbl}[{srcdir}] count(*) = {n_rows}")
    print("")

Right now we don't really populate data with metadata using code.  We copy in dera_base_schema.yml
that has all the metadata (and data quality checks) from the SEC DERA data dictionary.

In [None]:
# Open (and truncate) the dbt schema file; it stays open across the next cell and is
# closed explicitly by the dbt_yml.close() cell further down
dbt_yml = open(models_dir.joinpath("dera_schema.yml"), "w", encoding="utf-8")

In [None]:
# Emit the collected model metadata as a dbt schema file (YAML written by hand).
# NOTE(review): descriptions are written unquoted, so a description containing YAML
# syntax (for example ': ') would produce an invalid file.
print("version: 2", file=dbt_yml)

indent = 0
print("\nmodels:", file=dbt_yml)
indent = indent + 2
for name, model in dbt_dict['models'].items():
    print(f"{' '*indent}- name: {name}", file=dbt_yml)
    indent = indent + 2
    print(f"{' '*indent}description: {model['description']}", file=dbt_yml)
    # Models can be registered with a description only; in that case there is no
    # 'columns' entry and no columns section is written
    columns = model.get('columns', {})
    if columns:
        print(f"\n{' '*indent}columns:", file=dbt_yml)
    indent = indent + 2
    for col, col_props in columns.items():
        print(f"{' '*indent}- name: {col}", file=dbt_yml)
        indent = indent + 2
        for col_meta, col_value in col_props.items():
            print(f"{' '*indent}{col_meta}: {col_value}", file=dbt_yml)
        indent = indent - 2
    print("", file=dbt_yml) # newline comes for free...
    indent = indent - 4
indent = indent - 2
# Internal consistency check on the indentation bookkeeping above
assert indent == 0

In [None]:
# Close the schema file opened above so everything printed to it is flushed to disk
dbt_yml.close()

In [None]:
# Run the shell `date` command again; compare with the first cell's output for elapsed time
os.system('date')