# Ingest SEC DERA data into Trino pipeline

Copyright (C) 2021 OS-Climate

This sample shows:
* How to create schemas and tables via the Trino / SQLAlchemy on an underlying Iceberg data volume
* Apache Iceberg ACID transaction and time travel capabilities used for data set versioning


Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Contributed by Michael Tiemann (Github: MichaelTiemannOSC)

Run these in a notebook cell if you need to install onto your nb env

```python
# 'capture' magic prevents long outputs from spamming your notebook
%%capture pipoutput

# For loading predefined environment variables from files
# Typically used to load sensitive access credentials
%pip install python-dotenv

# Standard python package for interacting with S3 buckets
%pip install boto3

# Interacting with Trino and using Trino with sqlalchemy
%pip install trino sqlalchemy sqlalchemy-trino

# Pandas and parquet file i/o
%pip install pandas pyarrow fastparquet

# OS-Climate utilities to make data ingest easier
%pip install osc-ingest-tools
```

Load Environment Variables

In [1]:
from dotenv import dotenv_values, load_dotenv
import os
import pathlib

dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [2]:
import boto3

s3_source = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ['S3_LANDING_ENDPOINT'],
    aws_access_key_id=os.environ['S3_LANDING_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_LANDING_SECRET_KEY'],
)

In [3]:
s3_trino = boto3.resource(
    service_name="s3",
    endpoint_url=os.environ["S3_DEV_ENDPOINT"],
    aws_access_key_id=os.environ["S3_DEV_ACCESS_KEY"],
    aws_secret_access_key=os.environ["S3_DEV_SECRET_KEY"],
)
bucket = s3_trino.Bucket(os.environ["S3_DEV_BUCKET"])
bucket.name

'ocp-odh-os-demo-s3'

In [4]:
ingest_catalog = 'osc_datacommons_iceberg_dev'
ingest_schema = 'sec_dera'

In [5]:
# telling sqlalchemy about catalog has to be done in the sqlstring url:
import trino
from sqlalchemy.engine import create_engine

sqlstring = 'trino://{user}@{host}:{port}/{catalog}'.format(
    user = os.environ['TRINO_USER'],
    host = os.environ['TRINO_HOST'],
    port = os.environ['TRINO_PORT'],
    catalog = ingest_catalog
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    'http_scheme': 'https'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

Create custom meta data and declare variable for schema and table for the data set

In [6]:
# make sure schema exists, or table creation below will fail in weird ways
sql = f"""
create schema if not exists {ingest_catalog}.{ingest_schema}
"""
qres = engine.execute(sql)
print(qres.fetchall())

[(True,)]


Enter the Pandas!

In [7]:
import pandas as pd

Prepare GLEIF matching data

In [8]:
gleif_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],f'mtiemann-GLEIF/DERA-matches.csv')
gleif_file.download_file(f'/tmp/dera-gleif.csv')
gleif_df = pd.read_csv(f'/tmp/dera-gleif.csv', header=0, sep=',', dtype=str, engine='c')
gleif_dict = { k:v for k, v in zip(gleif_df.name, gleif_df.LEI) }

Load the SUB and NUM tables into Trino.  READS ONLY 10 ROWS RIGHT NOW!

In [9]:
# import osc_ingest_trino as osc

_p2smap = {
    'string': 'varchar',
    'float32': 'real',
    'Float32': 'real',
    'float64': 'double',
    'Float64': 'double',
    'int32': 'integer',
    'Int32': 'integer',
    'int64': 'bigint',
    'Int64': 'bigint',
    'bool': 'boolean',
    'category': 'varchar',
    'datetime64[ns, UTC]': 'timestamp(6)',
}

def pandas_type_to_sql(pt):
    st = _p2smap.get(pt)
    if st is not None:
        return st
    raise ValueError("unexpected pandas column type '{pt}'".format(pt=pt))

# add ability to specify optional dict for specific fields?
# if column name is present, use specified value?
def create_table_schema_pairs(df):
    if not isinstance(df, pd.DataFrame):
        raise ValueError("df must be a pandas DataFrame")
    ptypes = [str(e) for e in df.dtypes.to_list()]
    stypes = [pandas_type_to_sql(e) for e in ptypes]
    pz = list(zip(df.columns.to_list(), stypes))
    return ",\n".join(["    {n} {t}".format(n=e[0],t=e[1]) for e in pz])

In [16]:
import re
import io
import uuid

# Add a unique identifier to the data set
uid = str(uuid.uuid4())

dera_regex = re.compile(r' ?/.*$')

quarters = ['2020q4', '2021q1', '2021q2', '2021q3']

def ingest_dera_table(qtr, tbl):
    src_file = s3_source.Object(os.environ['S3_LANDING_BUCKET'],f'SEC-DERA/{qtr}/{tbl}.txt')
    timestamp = src_file.last_modified.isoformat()
    src_file.download_file(f'/tmp/dera-{tbl}-{timestamp}.csv')
    df = pd.read_csv(f'/tmp/dera-{tbl}-{timestamp}.csv', header=0, sep='\t', dtype='string', keep_default_na=False, nrows = 10, engine='c')
    
    df['uuid'] = uid
    df['quarter'] = qtr
    df = df.convert_dtypes (infer_objects=False, convert_string=True, convert_integer=False, convert_boolean=False, convert_floating=False)
    # Print the output
    # print(df.dtypes)
    
    if tbl=='sub':
        df.name = df.name.map(lambda x: re.sub(dera_regex, '', x))
        df.name = df.name.astype('string')
        df['LEI'] = df.name.map(gleif_dict)
        df.LEI = df.LEI.astype('string')
        df.cik = df.cik.astype('int32')
        df.sic = df.sic.astype('int32')
        df.ein = df.ein.astype('int64')
        df.wksi = df.wksi.astype('bool')
        # df.wksi = df.wksi.astype('int32')
        df.period = pd.to_datetime(df.period, format='%Y%m%d', utc=True)
        df.loc[df.fy!=''].assign(fy = lambda x: pd.to_datetime(x.fy, format='%Y', utc=True))
        df.filed = pd.to_datetime(df.filed, format='%Y%m%d', utc=True)
        df.accepted = pd.to_datetime(df.accepted, format='%Y-%m-%d %H:%M:%S', utc=True)
        df.prevrpt = df.prevrpt.astype('bool')
        df.detail = df.detail.astype('bool')
        # df.prevrpt = df.prevrpt.astype('int32')
        # df.detail = df.detail.astype('int32')
        df.nciks = df.nciks.astype('int32')
        
        cols = df.columns.tolist()
        cols = cols[0:3] + [cols[-1]] + cols[3:-1]
        df = df[cols]
    elif tbl=='num':
        # documentation wrongly lists coreg as NUMERIC length 256.  It is ALPHANUMERIC.
        df.ddate = pd.to_datetime(df.ddate, format='%Y%m%d', utc=True)
        df.qtrs = df.qtrs.astype('int32')
        df.value = df.value.astype('float64')
    print(df.dtypes)
    display(df.head())

    # Only drop table and create new one if we are starting from the beginning
    if qtr==quarters[0]:
        table_check = engine.execute(f'drop table if exists {ingest_catalog}.{ingest_schema}.{tbl}')
        for row in table_check.fetchall():
            print(row)

        columnschema = create_table_schema_pairs(df)

        tabledef = f"""
        create table if not exists {ingest_catalog}.{ingest_schema}.{tbl}(
        {columnschema}
        ) with (
            format = 'ORC',
            partitioning = array['quarter']
        )
        """
        print(tabledef)
        qres = engine.execute(tabledef)
        print(qres.fetchall())

        # The following is redundant...
        sql=f"""
        delete from {ingest_catalog}.{ingest_schema}.{tbl}
        """
        print(sql)
        qres = engine.execute(sql)
        print(qres.fetchall())

    # method = 'multi' is important, default will not work
    # important to tell it about schema here, and catalog when you create the db connection above
    # index = False, unless you declared that as a column when you create the table
    # use 'append' mode since we already created the table
    df.to_sql(tbl,
              con=engine,
              schema=ingest_schema,
              if_exists='append',
              index=False,
              method='multi')
    
    sql=f"""
    select * from {ingest_catalog}.{ingest_schema}.{tbl} limit 10
    """
    pd.read_sql(sql, engine)

In [17]:
for qtr in quarters:
    for ingest_table in [ 'sub', 'num' ]:
        ingest_dera_table(qtr, ingest_table)

adsh                       string
cik                         int32
name                       string
LEI                        string
sic                         int32
countryba                  string
stprba                     string
cityba                     string
zipba                      string
bas1                       string
bas2                       string
baph                       string
countryma                  string
stprma                     string
cityma                     string
zipma                      string
mas1                       string
mas2                       string
countryinc                 string
stprinc                    string
ein                         int64
former                     string
changed                    string
afs                        string
wksi                         bool
fye                        string
form                       string
period        datetime64[ns, UTC]
fy                         string
fp            

Unnamed: 0,adsh,cik,name,LEI,sic,countryba,stprba,cityba,zipba,bas1,...,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks,uuid,quarter
0,0000010254-20-000105,10254,EARTHSTONE ENERGY INC,54930081UTV7CK45XG14,1311,US,TX,THE WOODLANDS,77380,1400 WOODLOCH FOREST DRIVE,...,,2020-10-01 00:00:00+00:00,2020-10-01 16:05:00+00:00,True,True,form8-kxbbredeterminat_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4
1,0000016918-20-000236,16918,"CONSTELLATION BRANDS, INC.",5493005GKEG8QCVY7037,2080,US,NY,VICTOR,14564,207 HIGH POINT DRIVE,...,,2020-10-01 00:00:00+00:00,2020-10-01 08:04:00+00:00,True,True,stz-20200930_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4
2,0000016918-20-000237,16918,"CONSTELLATION BRANDS, INC.",5493005GKEG8QCVY7037,2080,US,NY,VICTOR,14564,207 HIGH POINT DRIVE,...,Q2,2020-10-01 00:00:00+00:00,2020-10-01 14:33:00+00:00,True,True,stz-20200831_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4
3,0000022444-20-000035,22444,COMMERCIAL METALS CO,549300OQS2LO07ZJ7N73,3312,US,TX,IRVING,75039,"6565 N. MACARTHUR BLVD., SUITE 800",...,,2020-10-01 00:00:00+00:00,2020-10-01 09:45:00+00:00,True,True,cmc-20201001_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4
4,0000034088-20-000081,34088,EXXON MOBIL CORP,J3WHBG0MTS7O8ZVMDC91,2911,US,TX,IRVING,75039-2298,5959 LAS COLINAS BLVD,...,,2020-10-01 00:00:00+00:00,2020-10-01 07:02:00+00:00,True,True,r8k100120_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4


(True,)

        create table if not exists osc_datacommons_iceberg_dev.sec_dera.sub(
            adsh varchar,
    cik integer,
    name varchar,
    LEI varchar,
    sic integer,
    countryba varchar,
    stprba varchar,
    cityba varchar,
    zipba varchar,
    bas1 varchar,
    bas2 varchar,
    baph varchar,
    countryma varchar,
    stprma varchar,
    cityma varchar,
    zipma varchar,
    mas1 varchar,
    mas2 varchar,
    countryinc varchar,
    stprinc varchar,
    ein bigint,
    former varchar,
    changed varchar,
    afs varchar,
    wksi boolean,
    fye varchar,
    form varchar,
    period timestamp(6),
    fy varchar,
    fp varchar,
    filed timestamp(6),
    accepted timestamp(6),
    prevrpt boolean,
    detail boolean,
    instance varchar,
    nciks integer,
    aciks varchar,
    uuid varchar,
    quarter varchar
        ) with (
            format = 'ORC',
            partitioning = array['quarter']
        )
        
[(True,)]

        delete from osc_dat

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote,uuid,quarter
0,0001640334-20-002446,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2020-08-31 00:00:00+00:00,0,USD,3500.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4
1,0001640334-20-002446,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2020-05-31 00:00:00+00:00,0,USD,4225.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4
2,0001104659-20-111952,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2018-12-31 00:00:00+00:00,0,USD,56149000.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4
3,0001104659-20-111952,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2019-12-31 00:00:00+00:00,0,USD,85295000.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4
4,0001104659-20-111952,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2020-06-30 00:00:00+00:00,0,USD,122946000.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2020q4


(True,)

        create table if not exists osc_datacommons_iceberg_dev.sec_dera.num(
            adsh varchar,
    tag varchar,
    version varchar,
    coreg varchar,
    ddate timestamp(6),
    qtrs integer,
    uom varchar,
    value double,
    footnote varchar,
    uuid varchar,
    quarter varchar
        ) with (
            format = 'ORC',
            partitioning = array['quarter']
        )
        
[(True,)]

        delete from osc_datacommons_iceberg_dev.sec_dera.num
        
[(None,)]
adsh                       string
cik                         int32
name                       string
LEI                        string
sic                         int32
countryba                  string
stprba                     string
cityba                     string
zipba                      string
bas1                       string
bas2                       string
baph                       string
countryma                  string
stprma                     string
cityma             

Unnamed: 0,adsh,cik,name,LEI,sic,countryba,stprba,cityba,zipba,bas1,...,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks,uuid,quarter
0,0000002178-21-000005,2178,"ADAMS RESOURCES & ENERGY, INC.",UP28TXL7K8XSRNNC7V02,5172,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,...,,2021-02-19 00:00:00+00:00,2021-02-19 07:10:00+00:00,True,True,ae-20210219_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1
1,0000002178-21-000031,2178,"ADAMS RESOURCES & ENERGY, INC.",UP28TXL7K8XSRNNC7V02,5172,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,...,,2021-03-04 00:00:00+00:00,2021-03-04 16:05:00+00:00,True,True,ae-20210304_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1
2,0000002178-21-000034,2178,"ADAMS RESOURCES & ENERGY, INC.",UP28TXL7K8XSRNNC7V02,5172,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,...,FY,2021-03-05 00:00:00+00:00,2021-03-05 15:08:00+00:00,True,True,ae-20201231_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1
3,0000002488-21-000002,2488,ADVANCED MICRO DEVICES INC,R2I72C950HOYXII45366,3674,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,...,,2021-01-06 00:00:00+00:00,2021-01-06 16:27:00+00:00,True,True,amd-20201231_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1
4,0000002488-21-000005,2488,ADVANCED MICRO DEVICES INC,R2I72C950HOYXII45366,3674,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,...,,2021-01-12 00:00:00+00:00,2021-01-12 16:27:00+00:00,True,True,amd-20210111_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1


adsh                     string
tag                      string
version                  string
coreg                    string
ddate       datetime64[ns, UTC]
qtrs                      int32
uom                      string
value                   float64
footnote                 string
uuid                     string
quarter                  string
dtype: object


Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote,uuid,quarter
0,0000024545-21-000004,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2020-12-31 00:00:00+00:00,0,USD,2889500000.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1
1,0000024545-21-000004,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2019-12-31 00:00:00+00:00,0,USD,2767300000.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1
2,0000034088-21-000012,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2020-12-31 00:00:00+00:00,0,USD,35221000000.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1
3,0000034088-21-000012,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2019-12-31 00:00:00+00:00,0,USD,41831000000.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1
4,0000356037-21-000004,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2020-12-31 00:00:00+00:00,0,USD,8888000.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1


adsh                       string
cik                         int32
name                       string
LEI                        string
sic                         int32
countryba                  string
stprba                     string
cityba                     string
zipba                      string
bas1                       string
bas2                       string
baph                       string
countryma                  string
stprma                     string
cityma                     string
zipma                      string
mas1                       string
mas2                       string
countryinc                 string
stprinc                    string
ein                         int64
former                     string
changed                    string
afs                        string
wksi                         bool
fye                        string
form                       string
period        datetime64[ns, UTC]
fy                         string
fp            

Unnamed: 0,adsh,cik,name,LEI,sic,countryba,stprba,cityba,zipba,bas1,...,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks,uuid,quarter
0,0000009389-21-000078,9389,BALL CORP,0BGI85ALH27ZJP15DY16,3411,US,CO,WESTMINSTER,80021,9200 W. 108TH CIRCLE,...,,2021-04-01 00:00:00+00:00,2021-04-01 10:49:00+00:00,True,True,bll-20210401x8k_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2
1,0000037996-21-000020,37996,FORD MOTOR CO,20S05OYHG0MQM4VUIC57,3711,US,MI,DEARBORN,48126,ONE AMERICAN ROAD,...,,2021-04-01 00:00:00+00:00,2021-04-01 15:38:00+00:00,True,True,f-20210401_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2
2,0000038009-21-000033,38009,FORD MOTOR CREDIT CO LLC,UDSQCVRUX5BONN0VY111,6159,US,MI,DEARBORN,48126,ONE AMERICAN ROAD,...,,2021-04-01 00:00:00+00:00,2021-04-01 16:49:00+00:00,True,True,fmcc-20210401_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2
3,0000038725-21-000080,38725,FRANKLIN ELECTRIC CO INC,RR6AURIKMXUE4Q24G284,3621,US,IN,FORT WAYNE,46809,9255 COVERDALE ROAD,...,,2021-04-01 00:00:00+00:00,2021-04-01 15:03:00+00:00,True,True,fele-20210401_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2
4,0000048732-21-000004,48732,CENTERPOINT ENERGY HOUSTON ELECTRIC LLC,,4911,US,TX,HOUSTON,77002,1111 LOUISIANA,...,,2021-04-01 00:00:00+00:00,2021-04-01 16:35:00+00:00,True,True,cehe-20210401_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2


adsh                     string
tag                      string
version                  string
coreg                    string
ddate       datetime64[ns, UTC]
qtrs                      int32
uom                      string
value                   float64
footnote                 string
uuid                     string
quarter                  string
dtype: object


Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote,uuid,quarter
0,0001640334-21-000798,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2021-01-31 00:00:00+00:00,0,USD,10010.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2
1,0001640334-21-000798,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2020-10-31 00:00:00+00:00,0,USD,10913.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2
2,0001477932-21-002126,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2020-12-31 00:00:00+00:00,0,USD,2372072.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2
3,0001477932-21-002126,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2019-12-31 00:00:00+00:00,0,USD,1018145.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2
4,0001640334-21-000810,AccountsPayableAndAccruedLiabilitiesCurrent,us-gaap/2019,,2021-02-28 00:00:00+00:00,0,USD,2974.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2


adsh                       string
cik                         int32
name                       string
LEI                        string
sic                         int32
countryba                  string
stprba                     string
cityba                     string
zipba                      string
bas1                       string
bas2                       string
baph                       string
countryma                  string
stprma                     string
cityma                     string
zipma                      string
mas1                       string
mas2                       string
countryinc                 string
stprinc                    string
ein                         int64
former                     string
changed                    string
afs                        string
wksi                         bool
fye                        string
form                       string
period        datetime64[ns, UTC]
fy                         string
fp            

Unnamed: 0,adsh,cik,name,LEI,sic,countryba,stprba,cityba,zipba,bas1,...,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks,uuid,quarter
0,0000003570-21-000074,3570,CHENIERE ENERGY INC,MIHC87W9WTYSYZWV1J40,4924,US,TX,HOUSTON,77002,700 MILAM ST.,...,,2021-07-01 00:00:00+00:00,2021-07-01 16:33:00+00:00,True,True,lng-20210701_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
1,0000006845-21-000020,6845,"APOGEE ENTERPRISES, INC.",549300579S7QW8V4IJ60,3231,US,MN,MINNEAPOLIS,55435,4400 WEST 78TH STREET,...,Q1,2021-07-01 00:00:00+00:00,2021-07-01 15:19:00+00:00,True,True,apog-20210529_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
2,0000008947-21-000132,8947,AZZ INC,549300HD1X2NFLWPYC83,3640,US,TX,FORT WORTH,76107,"ONE MUSEUM PLACE, SUITE 500",...,,2021-07-01 00:00:00+00:00,2021-07-01 16:34:00+00:00,True,True,azz-20210701_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
3,0000012927-21-000056,12927,BOEING CO,RVHJWBXLJ1RFUBSY1F30,3721,US,WA,SEATTLE,98124,P O BOX 3707 MS 1F 31,...,,2021-07-01 00:00:00+00:00,2021-07-01 11:16:00+00:00,True,True,ba-20210629_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
4,0000022606-21-000005,22606,COMMONWEALTH EDISON CO,D8YOXB8L2ZDC8P7Q0771,4911,US,IL,CHICAGO,60605-1028,440 S LASALLE STREET,...,,2021-07-01 00:00:00+00:00,2021-07-01 16:20:00+00:00,True,True,exc-20210701_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3


adsh                     string
tag                      string
version                  string
coreg                    string
ddate       datetime64[ns, UTC]
qtrs                      int32
uom                      string
value                   float64
footnote                 string
uuid                     string
quarter                  string
dtype: object


Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote,uuid,quarter
0,0001477932-21-004399,NetIncomeLoss,us-gaap/2020,,2020-12-31 00:00:00+00:00,2,USD,-2701848.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
1,0001477932-21-004399,NetIncomeLoss,us-gaap/2020,,2020-12-31 00:00:00+00:00,1,USD,-356701.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
2,0001477932-21-004399,NetIncomeLoss,us-gaap/2020,,2020-09-30 00:00:00+00:00,1,USD,-2345147.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
3,0001477932-21-004399,NetIncomeLoss,us-gaap/2020,,2019-12-31 00:00:00+00:00,2,USD,-505994.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
4,0001477932-21-004399,NetIncomeLoss,us-gaap/2020,,2019-12-31 00:00:00+00:00,1,USD,-197097.0,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3


In [18]:
ingest_table = 'sub'

sql=f"""
select snapshot_id, committed_at from {ingest_catalog}.{ingest_schema}."{ingest_table}$snapshots"
    order by committed_at desc
    limit 5
"""
qres = engine.execute(sql)
snapshots = qres.fetchall()
snapshots

[(5838835114870883341, '2021-11-26 15:28:09.388 UTC'),
 (3316871770735269667, '2021-11-26 15:28:04.292 UTC'),
 (6147396229418705463, '2021-11-26 15:27:58.926 UTC'),
 (4659868933326989205, '2021-11-26 15:27:52.143 UTC'),
 (7014978886316893697, '2021-11-26 15:27:50.999 UTC')]

In [19]:
previous_snapshot = snapshots[1][0]
previous_snapshot

3316871770735269667

In [21]:
ingest_table

'sub'

In [20]:
sql=f"""
call {ingest_catalog}.system.rollback_to_snapshot('{ingest_schema}', '{ingest_table}', {previous_snapshot})
"""
qres = engine.execute(sql)
print(qres.fetchall())

TrinoQueryError: TrinoQueryError(type=INTERNAL_ERROR, name=PROCEDURE_CALL_FAILED, message="Cannot roll back to unknown snapshot id: 3316871770735269667", query_id=20211126_152827_00399_wtvpf)

In [22]:
sql=f"""
select * from {ingest_catalog}.{ingest_schema}.{ingest_table}
"""
pd.read_sql(sql, engine)

Unnamed: 0,adsh,cik,name,lei,sic,countryba,stprba,cityba,zipba,bas1,...,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks,uuid,quarter
0,0000003570-21-000074,3570,CHENIERE ENERGY INC,MIHC87W9WTYSYZWV1J40,4924,US,TX,HOUSTON,77002,700 MILAM ST.,...,,2021-07-01 00:00:00.000,2021-07-01 16:33:00.000,True,True,lng-20210701_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
1,0000009389-21-000078,9389,BALL CORP,0BGI85ALH27ZJP15DY16,3411,US,CO,WESTMINSTER,80021,9200 W. 108TH CIRCLE,...,,2021-04-01 00:00:00.000,2021-04-01 10:49:00.000,True,True,bll-20210401x8k_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q2
2,0000002178-21-000005,2178,"ADAMS RESOURCES & ENERGY, INC.",UP28TXL7K8XSRNNC7V02,5172,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,...,,2021-02-19 00:00:00.000,2021-02-19 07:10:00.000,True,True,ae-20210219_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q1
3,0000006845-21-000020,6845,"APOGEE ENTERPRISES, INC.",549300579S7QW8V4IJ60,3231,US,MN,MINNEAPOLIS,55435,4400 WEST 78TH STREET,...,Q1,2021-07-01 00:00:00.000,2021-07-01 15:19:00.000,True,True,apog-20210529_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
4,0000008947-21-000132,8947,AZZ INC,549300HD1X2NFLWPYC83,3640,US,TX,FORT WORTH,76107,"ONE MUSEUM PLACE, SUITE 500",...,,2021-07-01 00:00:00.000,2021-07-01 16:34:00.000,True,True,azz-20210701_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
5,0000012927-21-000056,12927,BOEING CO,RVHJWBXLJ1RFUBSY1F30,3721,US,WA,SEATTLE,98124,P O BOX 3707 MS 1F 31,...,,2021-07-01 00:00:00.000,2021-07-01 11:16:00.000,True,True,ba-20210629_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
6,0000022606-21-000005,22606,COMMONWEALTH EDISON CO,D8YOXB8L2ZDC8P7Q0771,4911,US,IL,CHICAGO,60605-1028,440 S LASALLE STREET,...,,2021-07-01 00:00:00.000,2021-07-01 16:20:00.000,True,True,exc-20210701_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
7,0000038777-21-000127,38777,FRANKLIN RESOURCES INC,RIFQSET379FOGTEFKS80,6282,US,CA,SAN MATEO,94403,ONE FRANKLIN PARKWAY,...,,2021-07-01 00:00:00.000,2021-07-01 16:08:00.000,True,True,ben-20210629_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
8,0000045012-21-000036,45012,HALLIBURTON CO,ENYF8GB5SMQZ25S06U51,1389,US,TX,HOUSTON,77032,3000 NORTH SAM HOUSTON PARKWAY EAST,...,,2021-07-01 00:00:00.000,2021-07-01 07:37:00.000,True,True,form8k_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3
9,0000049600-21-000086,49600,EASTGROUP PROPERTIES INC,31TIGQQZC4P6JMHKSW85,6798,US,MS,RIDGELAND,39157,400 W. PARKWAY PLACE,...,,2021-07-01 00:00:00.000,2021-07-01 17:23:00.000,True,True,egp-20210629_htm.xml,1,,3e498700-b325-4ec7-90f1-f230d45af98a,2021q3


Create metadata table for schema / dataset level information

In [None]:
# declare variable names for metadata structure in Trino
meta_schema_name = 'metastore_iceberg'
meta_table_name_dataset = 'meta_tables_iceberg'
meta_table_name_fields = 'meta_fields_iceberg'

In [None]:
custom_meta_content = {
    'dataset_key': 'SEC-DERA',
    'title': 'SEC DERA Disclosures',
    'description': 
    '''The DERA Financial Statement Data Sets provide numeric information from the face financials of all financial statements.
    
    This data is extracted from exhibits to corporate financial reports filed with the Commission using eXtensible Business Reporting Language (XBRL).  As compared to the more extensive Financial Statement and Notes Data Sets, which provide the numeric and narrative disclosures from all financial statements and their notes, the Financial Statement Data Sets are more compact.''',
    'version': '2020q4',
    'release_date': '20201231',
    'fields': [
    {
        'adsh':'Accession Number. The 20-character string formed from the 18-digit number assigned by the SEC to each EDGAR submission.',
        'cik':'Central Index Key (CIK). Ten digit number assigned by the SEC to each registrant that submits filings.',
        'name':'Name of registrant. This corresponds to the name of the legal entity as recorded in EDGAR as of the filing date.',
        'sic':'Standard Industrial Classification (SIC). Four digit code assigned by the SEC as of the filing date, indicating the registrant’s type of business.',
        'countryba':'The ISO 3166-1 country of the registrant’s business address.',
        'stprba':'The state or province of the registrant’s business address, if field countryba is US or CA.',
        'cityba':'The city of the registrant’s business address.',
        'zipba':'The zip code of the registrant’s business address.',
        'bas1':'The first line of the street of the registrant’s business address.',
        'bas2':'The second line of the street of the registrant’s business address.',
        'baph':'The phone number of the registrant’s business address.',
        'countryma':'The ISO 3166-1 country of the registrant’s mailing address.',
        'stprma':'The state or province of the registrant’s mailing address, if field countryma is US or CA.',
        'cityma':'The city of the registrant’s mailing address.',
        'zipma':'The zip code of the registrant’s mailing address.',
        'mas1':'The first line of the street of the registrant’s mailing address.',
        'mas2':'The second line of the street of the registrant’s mailing address.',
        'countryinc':'The country of incorporation for the registrant.',
        'stprinc':'The state or province of incorporation for the registrant, if countryinc is US or CA.',
        'ein':'Employee Identification Number, 9 digit identification number assigned by the Internal Revenue Service to business entities operating in the United States.',
        'former':'Most recent former name of the registrant, if any.',
        'changed':'Date of change from the former name, if any.',
        'afs':'Filer status with the SEC at the time of submission:\n\
1-LAF=Large Accelerated,\n\
2-ACC=Accelerated,\n\
3-SRA=Smaller Reporting Accelerated,\n\
4-NON=Non-Accelerated,\n\
5-SML=Smaller Reporting Filer,\n\
NULL=not assigned.',
        'wksi':'Well Known Seasoned Issuer (WKSI). An issuer that meets specific SEC requirements at some point during a 60-day period preceding the date the issuer satisfies its obligation to update its shelf registration statement.',
        'fye':'Fiscal Year End Date, rounded to nearest month-end.',
        'form':'The submission type of the registrant’s filing.',
        'period':'Balance Sheet Date, rounded to nearest month-end.',
        'fy':'Fiscal Year Focus (as defined in EFM Ch. 6).',
        'fp':'Fiscal Period Focus (as defined in EFM Ch. 6) within Fiscal Year. The 10-Q for the 1st, 2nd and 3rd quarters would have a fiscal period focus of Q1, Q2 (or H1), and Q3 (or M9) respectively, and a 10-K would have a fiscal period focus of FY.',
        'filed':'The date of the registrant’s filing with the Commission.',
        'accepted':'The acceptance date and time of the registrant’s filing with the Commission. Filings accepted after 5:30pm EST are considered filed on the following business day.',
        'prevrpt':'Previous Report –TRUE indicates that the submission information was subsequently amended.',
        'detail':'TRUE indicates that the XBRL submission contains quantitative disclosures within the footnotes and schedules at the required detail level (e.g., each amount).',
        'instance':'The name of the submitted XBRL Instance Document (EX-101.INS) type data file. The name often begins with the company ticker symbol.',
        'nciks':'Number of Central Index Keys (CIK) of registrants (i.e., business units) included in the consolidating entity’s submitted filing.',
        'aciks':'Additional CIKs of co-registrants included in  a consolidating entity’s EDGAR submission, separated by spaces. If there are no other co-registrants (i.e., nciks=1), the value of aciks is NULL.  For a very small number of filers, the list of co-registrants is too long to fit in the field.  Where this is the case, PARTIAL will appear at the end of the list indicating that not all co-registrants’ CIKs are included in the field; users should refer to the complete submission file for all CIK information.'
    }]
}

Convert custom metadata content in json format into Pandas DataFrame

In [None]:
df_meta_fields = pd.json_normalize(custom_meta_content, record_path =['fields'], meta=['dataset_key']).convert_dtypes()
df_meta_fields

In [None]:
df_meta_fields.info(verbose=True)

In [None]:
df_meta_table = pd.json_normalize(custom_meta_content, max_level=0)
df_meta_table.drop('fields', inplace=True, axis=1)
df_meta_table['schema'] = schemaname
df_meta_table = df_meta_table.convert_dtypes()
df_meta_table

In [None]:
df_meta_table.info(verbose=True)

In [None]:
schema_check = engine.execute(f'create schema if not exists {ingest_catalog}.{meta_schema_name}')
for row in schema_check.fetchall():
    print(row)

In [None]:
table_check = engine.execute(f'drop table if exists {ingest_catalog}.{meta_schema_name}.{meta_table_name_dataset}')
for row in table_check.fetchall():
    print(row)

In [None]:
schema_meta_table = create_table_schema_pairs(df_meta_table)
tabledef = f"""
create table if not exists {ingest_catalog}.{meta_schema_name}.{meta_table_name_dataset}(
{schema_meta_table}
) with (
    format = 'ORC',
    partitioning = array['dataset_key']
)
"""
print(tabledef)
qres = engine.execute(tabledef)
print(qres.fetchall())

In [None]:
meta_table_create = engine.execute(tabledef)
for row in meta_table_create.fetchall():
    print(row)

In [None]:
df_meta_table.to_sql(meta_table_name_dataset,
                     con=engine,
                     schema=meta_schema_name,
                     if_exists='append',
                     index=False,
                     method='multi')

list_values_meta_table = df_meta_table.values.tolist()
list_values_meta_table[0]

In [None]:
meta_query_table = (f'SELECT * FROM {ingest_catalog}.{meta_schema_name}.{meta_table_name_dataset} limit 10')
print(meta_query_table)
meta_table_query = engine.execute(meta_query_table)
for row in meta_table_query.fetchall():
    print(row)

Create metadata table for fields information

In [None]:
table_check = engine.execute(f'drop table if exists {ingest_catalog}.{meta_schema_name}.{meta_table_name_fields}')
for row in table_check.fetchall():
    print(row)

In [None]:
schema_meta_fields = create_table_schema_pairs(df_meta_fields)
tabledef = f"""
create table if not exists {ingest_catalog}.{meta_schema_name}.{meta_table_name_fields}(
{schema_meta_fields}
) with (
    format = 'ORC',
    partitioning = array['dataset_key']
)
"""
print(tabledef)
qres = engine.execute(tabledef)
print(qres.fetchall())

In [None]:
meta_fields_create = engine.execute(tabledef)
for row in meta_fields_create.fetchall():
    print(row)

In [None]:
df_meta_fields.to_sql(meta_table_name_fields,
                      con=engine,
                      schema=meta_schema_name,
                      if_exists='append',
                      index=False,
                      method='multi')

list_values_meta_fields = df_meta_fields.values.tolist()
list_values_meta_fields[0]

In [None]:
meta_query_fields = (f'SELECT * FROM {ingest_catalog}.{meta_schema_name}.{meta_table_name_fields} limit 10')
print(meta_query_table)
meta_table_query = engine.execute(meta_query_table)
for row in meta_table_query.fetchall():
    print(row)