In [1]:
#pip install trino urllib3 pandas

In [2]:
# sqlalchemy-trino has an integration bug w/ trino api, possibly fixed but unreleased
# also currently requires sqlalchemy 1.3, not 1.4
# however if they get this fixed it might be another useful connection api
#pip install --upgrade sqlalchemy==1.3 sqlalchemy-trino

In [3]:
# This may not be compatible with installing sqlalchemy-trino
#pip install 'pyhive[presto,trino]'
# https://github.com/dropbox/PyHive

In [1]:
# two possible apis to generate a trino connection:
import trino
from pyhive import presto

# pandas dfs
import pandas as pd

import urllib3
urllib3.disable_warnings()

In [2]:
from dotenv import dotenv_values, load_dotenv
import os
import pathlib

dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

In [3]:
JWT_TOKEN = os.environ['TRINO_PASSWD']
conn = trino.dbapi.connect(
    host=os.environ['TRINO_HOST'],
    port=int(os.environ['TRINO_PORT']),
    user=os.environ['TRINO_USER'],
    http_scheme='https',
    auth=trino.auth.JWTAuthentication(JWT_TOKEN),
    verify=True,
)
cur = conn.cursor()

In [22]:
cur.execute('show catalogs')
cur.fetchall()

[['jmx'], ['osc_datacommons_dev'], ['osc_datacommons_prod'], ['system']]

In [28]:
cur.execute('show tables from hive')
cur.fetchall()

TrinoUserError: TrinoUserError(type=USER_ERROR, name=MISSING_CATALOG_NAME, message="line 1:1: Catalog must be specified when session catalog is not set", query_id=20210930_023633_00023_iz7p7)

In [5]:
# this doesn't work with connection from trino api
# but it will work with connection from pyhive.presto, see below
#catDF = pd.read_sql("select * from hive.team1.cat", conn)

In [6]:
cur.execute('select * from hive.team1.cat')
df = pd.DataFrame(cur.fetchall())

# hack to move column names out of data
df, df.columns = df[1:], df.iloc[0]
df.reset_index(drop=True)

# infer data types (these will be str)
df = df.convert_dtypes()

# convert length to int
df['length'] = pd.to_numeric(df.length)

df.head()

Unnamed: 0,word,length
1,the,3
2,cat,3
3,in,2
4,the,3
5,hat,3


In [7]:
# check column types
df.dtypes

word      string
length     int64
dtype: object

## Using a pyhive.presto connection

In [8]:
import pyhive.presto
conn = presto.connect(
    username = trino_user,
    password = trino_passwd,
    host = trino_host,
    port = trino_port,
    protocol = 'https'
)

In [9]:
# With pyhive.presto you can load directly from the connection,
# although resulting DF is same as from trino api
df = pd.read_sql("select * from hive.team1.cat", conn)
df.head()

Unnamed: 0,word,length
0,word,length
1,the,3
2,cat,3
3,in,2
4,the,3


## **** Below is not working
see:
https://github.com/dungdm93/sqlalchemy-trino/issues/23

In [None]:
sqlstring = 'trino://{user}:{passwd}@{host}:{port}/hive'.format(
    user = trino_user,
    passwd = trino_passwd,
    host=trino_host,
    port=trino_port
)
sqlstring

In [None]:
import sqlalchemy
engine = sqlalchemy.engine.create_engine(
    sqlstring)

In [None]:
df = pd.read_sql("select * from hive.team1.cat", engine)

In [None]:
import sqlalchemy

sqlstring = 'trino://{user}:{passwd}@{host}:{port}/hive'.format( \
    user = trino_user, \
    passwd = trino_passwd, \
    host=trino_host, \
    port=trino_port
)
engine = sqlalchemy.create_engine(sqlstring)

In [None]:
# https://github.com/dungdm93/sqlalchemy-trino/issues/23
pd.read_sql_query('select * from hive.team1.cat', sqlstring)