# Begin with Credentials and Connection to Trino

In [1]:
import os
import pathlib
from dotenv import load_dotenv

# Optionally seed the process environment from a `credentials.env` file.
# The directory comes from CREDENTIAL_DOTENV_DIR, falling back to PWD and then to
# '/opt/app-root/src'. When the file is absent nothing is loaded and no error is
# raised, so the variables can be supplied by some other mechanism instead.
dotenv_dir = os.environ.get(
    'CREDENTIAL_DOTENV_DIR',
    os.environ.get('PWD', '/opt/app-root/src'),
)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
if os.path.exists(dotenv_path):
    # override=True: values read from the file replace variables that are already set.
    load_dotenv(dotenv_path=dotenv_path, override=True)

Connect to Trino, setting the default catalog in the connection arguments so that queries can use the more compact `schema.table` form

In [2]:
import trino
from sqlalchemy.engine import create_engine

# Prefix shared by the environment variables that hold the Trino connection settings.
env_var_prefix = 'TRINO'

# Fill the connection URL template from <prefix>_USER, <prefix>_HOST and <prefix>_PORT
# (looked up in that order); a missing variable raises KeyError.
sqlstring = 'trino://{user}@{host}:{port}/'.format(**{
    field: os.environ[f'{env_var_prefix}_{field.upper()}']
    for field in ('user', 'host', 'port')
})

# Extra arguments handed to the Trino driver: JWT authentication using the token stored
# in <prefix>_PASSWD, HTTPS transport, and the default catalog for the connection.
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    http_scheme='https',
    catalog='osc_datacommons_dev',
)

# Build the SQLAlchemy engine and open a connection right away.
engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

import pandas as pd

In [3]:
from osc_ingest_trino import *

# Attach the S3 bucket identified by "S3_DEV"; the returned handle is kept in
# `trino_bucket` and passed to the drop/ingest helper calls later in the notebook.
# NOTE(review): `attach_s3_bucket` presumably comes from the `osc_ingest_trino` star
# import, and "S3_DEV" is presumably a prefix for the bucket's credential environment
# variables -- confirm against the osc_ingest_trino documentation.
trino_bucket = attach_s3_bucket("S3_DEV")

In [4]:
# Statements for the fiscal-year revenue view, run in order:
#   1. (re)create sec_dera.fy_revenue_by_lei -- joins `sub` to `num` on adsh, keeps
#      full-year (fp='FY', qtrs=4) USD facts from 10-K / 20-F filings with no
#      co-registrant whose ddate falls in [fy, fy + 1 year), restricted to the listed
#      revenue tags, and takes max(value) per (name, lei, fy, ddate, sic);
#   2. count the rows in the view;
#   3. fetch a 10-row sample.
fy_revenue_statements = (
    """
create or replace view sec_dera.fy_revenue_by_lei as
select name, lei, fy, N.ddate, sic, max(value) as tot_revenue
from sec_dera.sub as S join sec_dera.num as N on S.adsh=N.adsh and N.ddate>=S.fy and date_add('year', 1, S.fy)>N.ddate
  -- left join sec_dera.ticker T on S.cik=T.cik
where S.fp='FY' and (S.form='10-K' or S.form='20-F')
and coreg is NULL
and qtrs=4
and uom='USD'
and (N.tag='Revenues'
     or N.tag='RevenueFromContractWithCustomerIncludingAssessedTax'
     or N.tag='RevenueFromContractWithCustomerExcludingAssessedTax'
     or N.tag='RevenuesNetOfInterestExpense'
     or N.tag='RegulatedAndUnregulatedOperatingRevenue'
     or N.tag='RegulatedOperatingRevenuePipelines')
group by name, lei, fy, N.ddate, sic
""",
    """
select count (*) from sec_dera.fy_revenue_by_lei
""",
    """
select * from sec_dera.fy_revenue_by_lei limit 10
""",
)

# Execute each statement and show everything it returns.
for statement in fy_revenue_statements:
    qres = engine.execute(statement)
    display(qres.fetchall())

[(True,)]

[(9673,)]

[('ORGANICELL REGENERATIVE MEDICINE, INC.', None, '2019-01-01 00:00:00.000', '2019-10-31 00:00:00.000', 3790, 1702271.0),
 ('BRIGHTVIEW HOLDINGS, INC.', '549300F8QN8YHVI7L866', '2020-01-01 00:00:00.000', '2020-09-30 00:00:00.000', 700, 2346000000.0),
 ('HP INC', 'WHKXQACZ14C5XRO8LW03', '2020-01-01 00:00:00.000', '2020-10-31 00:00:00.000', 3570, 56639000000.0),
 ('CARNEGIE DEVELOPMENT, INC', '549300JMXKV7IFUEKB68', '2019-01-01 00:00:00.000', '2019-12-31 00:00:00.000', 3510, None),
 ('MDJM LTD', None, '2019-01-01 00:00:00.000', '2019-12-31 00:00:00.000', 6531, 5679977.0),
 ('RALPH LAUREN CORP', 'HO1QNWM0IXBZ0QSMMO20', '2019-01-01 00:00:00.000', '2019-03-31 00:00:00.000', 2320, 6313000000.0),
 ('OLIVIA VENTURES, INC.', None, '2019-01-01 00:00:00.000', '2019-03-31 00:00:00.000', 6770, None),
 ('MAXIMUS, INC.', '549300DQCDS8HJ7QF202', '2020-01-01 00:00:00.000', '2020-09-30 00:00:00.000', 7389, 3461537000.0),
 ('APPSOFT TECHNOLOGIES, INC.', None, '2019-01-01 00:00:00.000', '2019-12-31 00:00:

In [5]:
# Statements for the public-float view, run in order:
#   1. (re)create sec_dera.float_by_lei -- joins `sub` to `num` on adsh, keeps
#      point-in-time (qtrs=0) USD facts from fiscal-year 10-K / 20-F filings with no
#      co-registrant whose ddate falls in [fy, fy + 1 year), restricted to the listed
#      public-float tags, and exposes max(value) per (name, lei, ddate, sic) as market_cap;
#   2. count the rows in the view;
#   3. fetch a 10-row sample.
float_statements = (
    """
create or replace view sec_dera.float_by_lei as
select name, lei, ddate, sic, max(value) as market_cap
from sec_dera.sub as S join sec_dera.num as N on S.adsh=N.adsh and N.ddate>=S.fy and date_add('year', 1, S.fy)>N.ddate
  -- join sec_dera.ticker T on S.cik=T.cik
where S.fp='FY' and (S.form='10-K' or S.form='20-F')
and coreg is NULL
and qtrs=0
and uom='USD'
and (tag='EntityPublicFloat'
     or tag='EntitysPublicFloat'
     or tag='FreeFloat'
     or tag='PublicFloat'
     or tag='PublicFloatValue')
group by name, lei, ddate, sic
""",
    """
select count (*) from sec_dera.float_by_lei
""",
    """
select * from sec_dera.float_by_lei limit 10
""",
)

# Execute each statement and show everything it returns.
for statement in float_statements:
    qres = engine.execute(statement)
    display(qres.fetchall())

[(True,)]

[(9942,)]

[('HORNBECK OFFSHORE SERVICES INC', None, '2019-06-30 00:00:00.000', 4400, 43514494.0),
 ('GRANITESHARES PLATINUM TRUST', None, '2020-06-30 00:00:00.000', 6221, 9031000.0),
 ('VIEW SYSTEMS INC', None, '2019-06-30 00:00:00.000', 7380, 707585.0),
 ('BLUBUZZARD, INC.', None, '2019-06-30 00:00:00.000', 6770, 150200.0),
 ('PACIFIC SPORTS EXCHANGE INC.', None, '2020-02-29 00:00:00.000', 5940, 31000.0),
 ('ROAD MARSHALL, INC.', None, '2020-03-31 00:00:00.000', 7371, 500.0),
 ('MCCORMICK & CO INC', None, '2019-05-31 00:00:00.000', 2090, 20658784327.0),
 ('FIRSTCASH, INC', '549300V2X3UU7K0DG350', '2019-06-30 00:00:00.000', 5900, 3102000000.0),
 ('WELLTOWER INC.', 'T6IZ0MBEG5ACZDTR7D06', '2019-06-30 00:00:00.000', 6798, 32986689000.0),
 ('YETI HOLDINGS, INC.', '549300DSQICIGNT5GO63', '2019-06-30 00:00:00.000', 3949, 1137608546.0)]

In [6]:
# Statements for the cash view, run in order:
#   1. (re)create sec_dera.cash_by_lei -- joins `sub` to `num` on adsh, keeps
#      point-in-time (qtrs=0) USD facts from fiscal-year 10-K / 20-F filings with no
#      co-registrant whose ddate falls in [fy, fy + 1 year), restricted to the listed
#      cash tags, and takes max(value) per (name, lei, ddate, sic);
#   2. count the rows in the view;
#   3. fetch a 10-row sample.
cash_statements = (
    """
create or replace view sec_dera.cash_by_lei as
select name, lei, ddate, sic, max(value) as cash
from sec_dera.sub as S join sec_dera.num as N on S.adsh=N.adsh and N.ddate>=S.fy and date_add('year', 1, S.fy)>N.ddate
  -- join sec_dera.ticker T on S.cik=T.cik
where S.fp='FY' and (S.form='10-K' or S.form='20-F')
and coreg is NULL
and qtrs=0
and uom='USD'
and (tag='CashAndCashEquivalentsAtCarryingValue' or tag='Cash' or tag='CashEquivalentsAtCarryingValue' or tag='CashAndCashEquivalents')
group by name, lei, ddate, sic
""",
    """
select count (*) from sec_dera.cash_by_lei
""",
    """
select * from sec_dera.cash_by_lei limit 10
""",
)

# Execute each statement and show everything it returns.
for statement in cash_statements:
    qres = engine.execute(statement)
    display(qres.fetchall())

[(True,)]

[(11475,)]

[('EASTGROUP PROPERTIES INC', '31TIGQQZC4P6JMHKSW85', '2019-12-31 00:00:00.000', 6798, 224000.0),
 ('UNITEDHEALTH GROUP INC', '549300GHBMY8T5GXDE41', '2019-12-31 00:00:00.000', 6324, 10985000000.0),
 ('CHURCH & DWIGHT CO INC', 'POOXSI30AWAQGYJZC921', '2019-12-31 00:00:00.000', 2840, 155700000.0),
 ('UNIVERSAL STAINLESS & ALLOY PRODUCTS INC', '5493001OEIZDUGXZDE09', '2019-12-31 00:00:00.000', 3312, 170000.0),
 ('PUBLIC SERVICE CO OF COLORADO', '5BANEF6PGSEK0SNMQF44', '2019-12-31 00:00:00.000', 4931, 11400000.0),
 ('ENCORE WIRE CORP', None, '2019-12-31 00:00:00.000', 3350, 230965000.0),
 ('COCA COLA CO', 'UWJKFUJFZ02DKWI3RY53', '2019-12-31 00:00:00.000', 2080, 6480000000.0),
 ('GAIA, INC', '529900QUX8SBA2D93M92', '2019-12-31 00:00:00.000', 7812, 11494000.0),
 ('HAMILTON BEACH BRANDS HOLDING CO', '549300D746EZ23QVEB39', '2019-12-31 00:00:00.000', 3634, 2142000.0),
 ('DORCHESTER MINERALS, L.P.', '529900J537JTEZOMXG05', '2019-12-31 00:00:00.000', 1311, 15339000.0)]

In [8]:
# Statements for the debt view, run in order:
#   1. (re)create sec_dera.debt_by_lei -- joins `sub` to `num` on adsh, keeps
#      point-in-time (qtrs=0) USD facts from fiscal-year 10-K / 20-F filings with no
#      co-registrant whose ddate falls in [fy, fy + 1 year), restricted to the listed
#      debt tags, and takes max(value) per (name, lei, ddate, sic);
#   2. count the rows in the view;
#   3. fetch a 10-row sample.
debt_statements = (
    """
create or replace view sec_dera.debt_by_lei as
select name, lei, ddate, sic, max(value) as debt
from sec_dera.sub as S join sec_dera.num as N on S.adsh=N.adsh and N.ddate>=S.fy and date_add('year', 1, S.fy)>N.ddate
  -- join sec_dera.ticker T on S.cik=T.cik
where S.fp='FY' and (S.form='10-K' or S.form='20-F')
and coreg is NULL
and qtrs=0
and uom='USD'
and (tag='LongTermDebt' or tag='LongTermDebtFairValue' or tag='LongTermDebtAndCapitalLeaseObligations'
     or tag='DebtLongtermAndShorttermCombinedAmount' or tag='SecuredDebt' or tag='UnsecuredDebt'
     or tag='SubordinatedDebt' or tag='ConvertibleDebt')
group by name, lei, ddate, sic
""",
    """
select count (*) from sec_dera.debt_by_lei
""",
    """
select * from sec_dera.debt_by_lei limit 10
""",
)

# Execute each statement and show everything it returns.
for statement in debt_statements:
    qres = engine.execute(statement)
    display(qres.fetchall())

[(True,)]

[(5580,)]

[('FOUNTAIN HEALTHY AGING, INC.', None, '2020-12-31 00:00:00.000', 4700, 63506.0),
 ('CAPITAL PRODUCT PARTNERS L.P.', None, '2020-12-31 00:00:00.000', 4412, 374324000.0),
 ('NICHOLAS FINANCIAL INC', '5493005OYHXSC9R6FU12', '2019-03-31 00:00:00.000', 6153, 142619000.0),
 ('AIR T INC', '549300XSFJNDOT088737', '2019-03-31 00:00:00.000', 4513, 57653000.0),
 ('UGI CORP', 'DX6GCWD4Q1JO9CRE5I40', '2020-09-30 00:00:00.000', 4932, 6034000000.0),
 ('GENCOR INDUSTRIES INC', '549300X3YEO4XOYOD178', '2020-09-30 00:00:00.000', 3531, 0.0),
 ('SCANSOURCE, INC.', '7FHBAN017QDY4K2KP156', '2020-06-30 00:00:00.000', 5045, 218728000.0),
 ('IRON MOUNTAIN INC', 'SQL3F6CKNNBM3SQGHX24', '2020-12-31 00:00:00.000', 6798, 8703314000.0),
 ('BROOKLINE BANCORP INC', None, '2020-12-31 00:00:00.000', 6035, 820247000.0),
 ('VALUE EXCHANGE INTERNATIONAL, INC.', None, '2020-12-31 00:00:00.000', 7380, 62949.0)]

In [11]:
# Statements for the total-assets view, run in order:
#   1. (re)create sec_dera.assets_by_lei -- joins `sub` to `num` on adsh and keeps
#      point-in-time (qtrs=0) USD 'Assets' facts from fiscal-year 10-K / 20-F filings
#      with no co-registrant whose ddate falls in [fy, fy + 1 year). Unlike the other
#      *_by_lei views in this notebook there is no group by / max(), so every matching
#      fact is returned as its own row;
#   2. count the rows in the view;
#   3. fetch a 10-row sample.
assets_statements = (
    """
create or replace view sec_dera.assets_by_lei as
select name, lei, ddate, sic, value as assets
from sec_dera.sub as S join sec_dera.num as N on S.adsh=N.adsh and N.ddate>=S.fy and date_add('year', 1, S.fy)>N.ddate
  -- join sec_dera.ticker T on S.cik=T.cik
where S.fp='FY' and (S.form='10-K' or S.form='20-F')
-- and S.lei='I1BZKREC126H0VB1BL91'
and coreg is NULL
and qtrs=0
and uom='USD'
and tag='Assets'
""",
    """
select count (*) from sec_dera.assets_by_lei
""",
    """
select * from sec_dera.assets_by_lei limit 10
""",
)

# Execute each statement and show everything it returns.
for statement in assets_statements:
    qres = engine.execute(statement)
    display(qres.fetchall())

[(True,)]

[(12003,)]

[('HURCO COMPANIES INC', 'XSC5XDOTBYOBQPOG2O82', '2019-10-31 00:00:00.000', 3823, 301065000.0),
 ('ACRO BIOMEDICAL CO., LTD.', None, '2019-09-30 00:00:00.000', 7900, 1069464.0),
 ('ADAIAH DISTRIBUTION INC', None, '2019-10-31 00:00:00.000', 7372, 0.0),
 ('ANIXA BIOSCIENCES INC', None, '2019-10-31 00:00:00.000', 8071, 6293693.0),
 ('TIMBERLINE RESOURCES CORP', None, '2019-09-30 00:00:00.000', 1040, 15505141.0),
 ('NEUBASE THERAPEUTICS, INC.', None, '2019-09-30 00:00:00.000', 2834, 12531397.0),
 ('SILVER BULL RESOURCES, INC.', None, '2019-10-31 00:00:00.000', 1000, 9205892.0),
 ('VOIP-PAL.COM INC', None, '2019-09-30 00:00:00.000', 3661, 2568186.0),
 ('NUKKLEUS INC.', '549300EOLUCX2ZYTFU74', '2019-09-30 00:00:00.000', 8742, 203001.0),
 ('ALL FOR ONE MEDIA CORP.', None, '2019-09-30 00:00:00.000', 7900, 280586.0)]

In [12]:
# Definition of the combined financials view. `sub` is joined to `num` five times on
# adsh (aliases Nrevenue, Nassets, Ndebt, Ncash, Nfloat):
#   * Nrevenue: full-year (qtrs=4) revenue facts with ddate in [fy, fy + 1 year);
#   * Nassets / Ndebt / Ncash: point-in-time (qtrs=0) facts whose ddate equals the
#     revenue ddate;
#   * Nfloat: point-in-time public-float facts with ddate in [fy, fy + 1 year).
# All facts must be USD with no co-registrant, from fiscal-year 10-K / 20-F filings.
# The result is max(value) of each metric per (name, lei, fy, revenue ddate, sic).
financials_view_sql = """
create or replace view sec_dera.financials_by_lei as
select name, lei, fy, Nrevenue.ddate as r_ddate, sic,
       max(Nrevenue.value) as revenue,
       max(Ncash.value) as cash,
       max(Ndebt.value) as debt,
       max(Nassets.value) as assets,
       max(Nfloat.value) as market_cap
       -- max(Nfloat.value)+max(Ndebt.value)-max(Ncash.value) as ev,
       -- max(Nfloat.value)+max(Ndebt.value) as evic
from sec_dera.sub as S
      join sec_dera.num as Nrevenue on S.adsh=Nrevenue.adsh
           and Nrevenue.ddate>=S.fy and date_add('year', 1, S.fy)>Nrevenue.ddate
      join sec_dera.num as Nassets on S.adsh=Nassets.adsh and Nassets.ddate=Nrevenue.ddate
      join sec_dera.num as Ndebt on S.adsh=Ndebt.adsh and Ndebt.ddate=Nrevenue.ddate
      join sec_dera.num as Ncash on S.adsh=Ncash.adsh and Ncash.ddate=Nrevenue.ddate
      join sec_dera.num as Nfloat on S.adsh=Nfloat.adsh
where S.fp='FY' and (S.form='10-K' or S.form='20-F')
      -- and S.lei='I1BZKREC126H0VB1BL91'
and Nrevenue.qtrs=4
and Nrevenue.coreg is NULL
and Nrevenue.uom='USD'
and (Nrevenue.tag='Revenues'
     or Nrevenue.tag='RevenueFromContractWithCustomerIncludingAssessedTax'
     or Nrevenue.tag='RevenueFromContractWithCustomerExcludingAssessedTax'
     or Nrevenue.tag='RevenuesNetOfInterestExpense'
     or Nrevenue.tag='RegulatedAndUnregulatedOperatingRevenue'
     or Nrevenue.tag='RegulatedOperatingRevenuePipelines')
and Nfloat.qtrs=0 and Nfloat.ddate>=fy and date_add('year', 1, fy)>Nfloat.ddate
and Nfloat.coreg is NULL
and Nfloat.uom='USD'
and (Nfloat.tag='EntityPublicFloat'
     or Nfloat.tag='EntitysPublicFloat'
     or Nfloat.tag='FreeFloat'
     or Nfloat.tag='PublicFloat'
     or Nfloat.tag='PublicFloatValue')
and Ncash.qtrs=0
and Ncash.coreg is NULL
and Ncash.uom='USD'
and (Ncash.tag='CashAndCashEquivalentsAtCarryingValue'
     or Ncash.tag='Cash'
     or Ncash.tag='CashEquivalentsAtCarryingValue'
     or Ncash.tag='CashAndCashEquivalents')
and Ndebt.qtrs=0
and Ndebt.coreg is NULL
and Ndebt.uom='USD'
and (Ndebt.tag='LongTermDebt'
     or Ndebt.tag='LongTermDebtFairValue'
     or Ndebt.tag='LongTermDebtAndCapitalLeaseObligations'
     or Ndebt.tag='DebtLongtermAndShorttermCombinedAmount'
     or Ndebt.tag='SecuredDebt'
     or Ndebt.tag='UnsecuredDebt'
     or Ndebt.tag='SubordinatedDebt'
     or Ndebt.tag='ConvertibleDebt')
and Nassets.qtrs=0
and Nassets.coreg is NULL
and Nassets.uom='USD'
and Nassets.tag='Assets'
group by name, lei, fy, Nrevenue.ddate, sic
"""

# Query that pulls every row of the view back to the client.
financials_select_sql = """
select * from sec_dera.financials_by_lei
"""

# (Re)create the view and show the statement's result.
qres = engine.execute(financials_view_sql)
display(qres.fetchall())

# Materialise the whole view into `l` (reused by later cells) and report the row count.
qres = engine.execute(financials_select_sql)
l = qres.fetchall()

print(len(l))

[(True,)]

TrinoQueryError: TrinoQueryError(type=INTERNAL_ERROR, name=REMOTE_TASK_MISMATCH, message="Could not communicate with the remote task. The node may have crashed or be under too much load. This is probably a transient issue, so please retry your query in a few minutes. (10.129.7.18:8080)", query_id=20211213_150050_00035_85hmd)

In [None]:
# Collect the view's column names: the first field of every row returned by DESCRIBE.
describe_result = engine.execute("describe sec_dera.financials_by_lei")
columns = [row[0] for row in describe_result.fetchall()]

In [None]:
# Wrap the fetched rows in a DataFrame labelled with the view's column names.
df = pd.DataFrame(l, columns=columns)

In [None]:
# Show the DataFrame built from the financials_by_lei rows as this cell's output.
df

In [None]:
# Let pandas pick the best nullable (pd.NA-aware) dtype for each column before export.
df = df.convert_dtypes()

# Target location of the table that will hold the DataFrame's contents.
corp_catalog = "osc_datacommons_dev"
corp_schema = "sec_dera"
corp_table = "corp_data_df"

# NOTE(review): the four helpers below are presumably provided by the osc_ingest_trino
# star import; judging by their names they drop any previous table definition and its
# stored data, write the frame out as parquet, and build the DDL for the new table --
# confirm against that package's documentation.
drop_unmanaged_table(corp_catalog, corp_schema, corp_table, engine, trino_bucket, verbose=True)

drop_unmanaged_data(corp_schema, corp_table, trino_bucket, verbose=True)

ingest_unmanaged_parquet(
    df, corp_schema, corp_table, trino_bucket,
    partition_columns=[], append=False, workdir='/tmp', verbose=True,
)

sql = unmanaged_parquet_tabledef(
    df, corp_catalog, corp_schema, corp_table, trino_bucket,
    partition_columns=[], verbose=True,
)

# Run the generated table definition and show its result.
qres = engine.execute(sql)
display(qres.fetchall())

# Read the freshly created table back in full; the rows replace the previous `l`.
qres = engine.execute("select * from sec_dera.corp_data_df")
l = qres.fetchall()

In [None]:
# Ask Trino for the query plan of a full scan of the financials view and print the
# first column of the first returned row.
explain_sql = """
explain select * from sec_dera.financials_by_lei
"""
qres = engine.execute(explain_sql)
plan_rows = qres.fetchall()
print(plan_rows[0][0])