## Создание Hive-таблицы

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) a Hive-enabled Spark session: it connects to the Spark
# master at spark-master:7077 and to the Hive metastore at hive-metastore:9083.
spark = (
    SparkSession.builder
    .appName("from_hive")
    .master("spark://spark-master:7077")
    .config("spark.sql.catalogImplementation", "hive")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/15 17:05:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Sanity check of the metastore connection: list the databases that the
# Hive-enabled session can see and print them.
df_load = spark.sql('show databases')
df_load.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [4]:
# Shut down this session; the next cell builds a new one under a different
# application name.
spark.stop()

Попробуем создать из CSV-файла `breweries.csv` несколько Parquet-файлов

In [5]:
from pyspark.sql import SparkSession

# Hive-enabled session for the CSV -> Parquet conversion job; same master and
# metastore endpoints as before, only the application name differs.
spark = (
    SparkSession.builder
    .appName("csv_to_many_parquets")
    .master("spark://spark-master:7077")
    .config("spark.sql.catalogImplementation", "hive")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Explicit schema for the header-less CSV: five nullable columns, declared in
# file order as (column name, Spark type) pairs.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("NUM", IntegerType()),
        ("NAME", StringType()),
        ("CITY", StringType()),
        ("STATE", StringType()),
        ("ID", IntegerType()),
    )
])

# Read the CSV from HDFS with the explicit schema (no header row) and split it
# into 4 partitions, so that the write below produces several Parquet files.
df = (
    spark.read
    .schema(schema)
    .option("header", "false")
    .csv("hdfs://namenode:9000/data/openbeer/breweries/breweries.csv")
    .repartition(4)
)

# Replace whatever is already stored at the target directory.
df.write.parquet(
    "hdfs://namenode:9000/data/openbeer/breweries_parquet",
    mode="overwrite",
)

                                                                                

Теперь создадим в Hive внешнюю таблицу поверх получившихся Parquet-файлов

In [6]:
# Register an external Hive table over the Parquet directory written above.
# The column list mirrors the schema used when reading the CSV, and
# IF NOT EXISTS lets the cell be re-run once the table is already registered.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS breweries_parquet(
    NUM INT,
    NAME STRING,
    CITY STRING,
    STATE STRING,
    ID INT
)
STORED AS PARQUET
LOCATION 'hdfs://namenode:9000/data/openbeer/breweries_parquet'
""")

25/10/15 17:05:39 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


DataFrame[]

In [7]:
# Confirm that the new table is now listed in the current database.
spark.sql("SHOW TABLES").show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|  default|breweries_parquet|      false|
+---------+-----------------+-----------+



In [8]:
# Preview the table contents; show() prints only the first rows.
spark.sql("SELECT * FROM default.breweries_parquet").show()

+---+--------------------+------------------+-----+---+
|NUM|                NAME|              CITY|STATE| ID|
+---+--------------------+------------------+-----+---+
| 77|    Blue Owl Brewing|            Austin|   TX| 77|
|515|Crabtree Brewing ...|           Greeley|   CO|515|
|222|Carton Brewing Co...|Atlantic Highlands|   NJ|222|
|512|    Cottrell Brewing|         Pawcatuck|   CT|512|
|332|La Cumbre Brewing...|       Albuquerque|   NM|332|
|425|           Ciderboys|     Stevens Point|   WI|425|
|483|Bale Breaker Brew...|            Yakima|   WA|483|
|291|Tommyknocker Brewery|     Idaho Springs|   CO|291|
|350|Central Coast Bre...|   San Luis Obispo|   CA|350|
|334|The Traveler Beer...|        Burlington|   VT|334|
|327|Half Acre Beer Co...|           Chicago|   IL|327|
|299|Matt Brewing Company|             Utica|   NY|299|
|311|Dirty Bucket Brew...|       Woodinville|   WA|311|
|479|     Heavy Seas Beer|        Halethorpe|   MD|479|
|528|Asheville Brewing...|         Asheville|   

In [9]:
# Shut down this session before the ClickHouse-enabled one is created below.
spark.stop()

## HDFS(Hive) -> ClickHouse

In [3]:
from pyspark.sql import SparkSession

import os
from getpass import getpass

# Never keep the ClickHouse password in the notebook itself: take it from the
# CLICKHOUSE_PASSWORD environment variable, or prompt for it interactively
# when the variable is unset or empty.
clickhouse_password = os.environ.get("CLICKHOUSE_PASSWORD") or getpass("ClickHouse password: ")

# Session that can read from Hive and write to ClickHouse:
#   * Hive tables are resolved through the metastore at hive-metastore:9083;
#   * the ClickHouse connector and its HTTP client jars are pulled in through
#     spark.jars.packages;
#   * the `clickhouse` catalog talks to clickhouse-server over HTTP (port 8123)
#     and writes data in JSON format.
spark = (
    SparkSession.builder
    .appName("from_hive_to_clickhouse")
    .master("spark://spark-master:7077")
    # hive confs
    .config("spark.sql.catalogImplementation", "hive")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    # clickhouse confs
    .config(
        "spark.jars.packages",
        ",".join([
            "com.clickhouse.spark:clickhouse-spark-runtime-3.4_2.12:0.8.0",
            "com.clickhouse:clickhouse-client:0.7.0",
            "com.clickhouse:clickhouse-http-client:0.7.0",
            "org.apache.httpcomponents.client5:httpclient5:5.2.1",
        ])
    )
    .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
    .config("spark.sql.catalog.clickhouse.host", "clickhouse-server")
    .config("spark.sql.catalog.clickhouse.protocol", "http")
    .config("spark.sql.catalog.clickhouse.http_port", "8123")
    .config("spark.sql.catalog.clickhouse.user", "default")
    .config("spark.sql.catalog.clickhouse.password", clickhouse_password)
    .config("spark.sql.catalog.clickhouse.database", "default")
    .config("spark.clickhouse.write.format", "json")
    .getOrCreate()
)



:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
com.clickhouse.spark#clickhouse-spark-runtime-3.4_2.12 added as a dependency
com.clickhouse#clickhouse-client added as a dependency
com.clickhouse#clickhouse-http-client added as a dependency
org.apache.httpcomponents.client5#httpclient5 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f27a855e-a5cd-4187-b5ce-9398c363148e;1.0
	confs: [default]
	found com.clickhouse.spark#clickhouse-spark-runtime-3.4_2.12;0.8.0 in central
	found com.clickhouse#clickhouse-client;0.7.0 in central
	found com.clickhouse#clickhouse-data;0.7.0 in central
	found com.clickhouse#clickhouse-http-client;0.7.0 in central
	found org.apache.httpcomponents.client5#httpclient5;5.2.1 in central
	found org.apache.httpcomponents.core5#httpcore5;5.2 in central
	found org.apache.httpcomponents.core5#httpcore5-h2;5.2 in central
	found org.slf4j#slf4j-api;1.7.36 in central
:: r

In [4]:
# Load the Hive table that will be copied to ClickHouse and print its schema
# (column names and types) for comparison with the target table.
df = spark.sql("SELECT * FROM default.breweries_parquet")
df.printSchema()

root
 |-- NUM: integer (nullable = true)
 |-- NAME: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- ID: integer (nullable = true)



ClickHouse чувствителен к регистру, поэтому имена колонок в таблице должны в точности совпадать с именами колонок DataFrame

In [5]:
# Create the target table through the `clickhouse` catalog. It keeps only
# three columns (ID, NAME, CITY), with ID declared NOT NULL, and uses the
# MergeTree engine ordered by ID. IF NOT EXISTS lets the cell be re-run once
# the table already exists.
spark.sql("""
CREATE TABLE IF NOT EXISTS clickhouse.default.test_table_2 (
    ID INT NOT NULL,
    NAME STRING,
    CITY STRING
)
USING clickhouse
TBLPROPERTIES (
    'engine'='MergeTree()',
    'order_by'='ID'
)
""")

DataFrame[]

In [6]:
# The target table only has ID, NAME and CITY, while `df` also carries NUM and
# STATE. Project to exactly the target columns (names are case-sensitive) so
# the append does not depend on extra fields being silently dropped on write.
df.select("ID", "NAME", "CITY").writeTo("clickhouse.default.test_table_2").append()

25/10/15 17:59:45 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/10/15 18:00:00 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/10/15 18:00:15 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

In [7]:
# Read the rows back through the `clickhouse` catalog to verify the write.
# NOTE(review): this rebinds `df`, the same name the append cell writes from;
# re-running that append after this cell would write the ClickHouse rows back
# into the same table. Consider a distinct name for the read-back frame.
df = spark.sql("select * from clickhouse.default.test_table_2")
df.show()

                                                                                

+---+--------------------+--------------+
| ID|                NAME|          CITY|
+---+--------------------+--------------+
|  0|  NorthGate Brewing |   Minneapolis|
|  1|Against the Grain...|    Louisville|
|  2|Jack's Abby Craft...|    Framingham|
|  5|COAST Brewing Com...|    Charleston|
|  7|    Tapistry Brewing|      Bridgman|
| 15|Founders Brewing ...|  Grand Rapids|
| 17|Tin Man Brewing C...|    Evansville|
| 19|   Brew Link Brewing|    Plainfield|
| 21| Three Pints Brewing|  Martinsville|
| 23|Indiana City Brewing|  Indianapolis|
| 24|    Burn 'Em Brewing| Michigan City|
| 26|  Evil Czech Brewery|     Mishawaka|
| 27|450 North Brewing...|      Columbus|
| 29| Cedar Creek Brewery|  Seven Points|
| 31|Boulevard Brewing...|   Kansas City|
| 32|James Page Brewin...| Stevens Point|
| 34|Ballast Point Bre...|     San Diego|
| 35|Anchor Brewing Co...| San Francisco|
| 39|Gonzo's BiggDogg ...|     Kalamazoo|
| 41| Lost Nation Brewing|East Fairfield|
+---+--------------------+--------