PyCQuery is a collection of Python DB-API and SQLAlchemy interfaces for Hive 3.x. Other than PyHive, it supports features below.
- Dynamic service discovery through Zookeeper
- Kerberos authentication with pure-python Kerberos library, minikerberos
- Python 3.6 or above
- HiveServer2 instance for Hive 3.x
pip3 install PyCQuery
from pycquery import hive
cursor = hive.connect('localhost').cursor()
cursor.execute('SELECT * FROM my_data LIMIT 10')
print(cursor.fetchone())
print(cursor.fetchall())
cursor.close()
from pycquery import hive
from TCLIService.ttypes import TOperationState
cursor = hive.connect('localhost').cursor()
cursor.execute('SELECT * FROM my_data LIMIT 10', async_=True)
status = cursor.poll().operationState
while status in (TOperationState.INITIALIZED_STATE, TOperationState.RUNNING_STATE):
logs = cursor.fetch_logs()
for message in logs:
print(message)
# If needed, an asynchronous query can be cancelled at any time with:
# cursor.cancel()
status = cursor.poll().operationState
print(cursor.fetchall())
cursor.close()
First install this package to register it with SQLAlchemy (see setup.py
).
from sqlalchemy import *
from sqlalchemy.engine import create_engine
from sqlalchemy.schema import *
engine = create_engine('hive://localhost:10000/default')
logs = Table('my_data', MetaData(bind=engine), autoload=True)
print(select([func.count('*')], from_obj=logs).scalar())
Note: query generation functionality is not fully tested, but there should be no problem with raw SQL.
# DB-API
hive.connect('localhost', configuration={'hive.exec.reducers.max': '123'})
# SQLAlchemy
create_engine(
'hive://user@host:10000/database',
connect_args={'configuration': {'hive.exec.reducers.max': '123'}},
)
# SQLAlchemy with LDAP
create_engine(
'hive://user:password@host:10000/database',
connect_args={'auth': 'LDAP'},
)
- For Single HiveServer2 Instance
from pycquery import hive
from pycquery_krb.common import conf
params = {
'host': 'localhost',
'port': '10000',
'http_path': 'cliservice',
'service_mode': 'http',
'kerberos_service_name': 'hive/localhost',
'auth': 'KERBEROS',
'username': 'user1@EXAMPLE.COM',
'keytab_file': 'keytab-file-for-username',
'krb_conf': conf.KerberosConf.from_osenv() # load krb5.conf from the path set by KRB5_CONFIG or /etc/krb5.conf
# 'krb_conf': conf.KerberosConf.from_file('other-krb5-conf-file') # load krb5.conf from your path
}
if __name__ == "__main__":
conn = hive.connect(**params)
try:
cursor = conn.cursor()
try:
cursor.execute('SELECT * FROM my_data LIMIT 10')
print(cursor.fetchall())
finally:
cursor.close()
finally:
conn.close()
- For Multiple HiveServer2 Instances with Zookeeper Discovery
from pycquery import hive
from pycquery_krb.common import conf
params = {
'host': 'zk1.host, zk2.host, zk3.host', # list of zookeeper server hosts
'port': '2181', # zookeeper server port
'is_zookeeper': True,
'http_path': 'cliservice',
'service_mode': 'http',
'kerberos_service_name': 'hive/localhost',
'auth': 'KERBEROS',
'username': 'user1@EXAMPLE.COM',
'keytab_file': 'keytab-file-for-username',
'krb_conf': conf.KerberosConf.from_osenv() # load krb5.conf from the path set by KRB5_CONFIG or /etc/krb5.conf
# 'krb_conf': conf.KerberosConf.from_file('other-krb5-conf-file') # load krb5.conf from your path
}
if __name__ == "__main__":
conn = hive.connect(**params)
try:
cursor = conn.cursor()
try:
cursor.execute('SELECT * FROM my_data LIMIT 10')
print(cursor.fetchall())
finally:
cursor.close()
finally:
conn.close()
- It requires
sasl
andthrift-sasl
. The pure-python Kerberos library is not supported yet.
The TCLIService module is autogenerated using a TCLIService.thrift
file. To update it, the
generate.py
file can be used: python generate.py <TCLIServiceURL>
. When left blank, the
version for Hive 3.0 will be downloaded.
Copyright 2021-present NAVER Corp.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.