In [1]:
from pyspark.sql import SparkSession

# Configure a Hive-enabled session against the standalone master at
# spark://spark-master:7077, with the metastore at thrift://hive-metastore:9083.
session_builder = (
    SparkSession.builder
    .appName("HiveTest")
    .master("spark://spark-master:7077")
    .config("spark.sql.catalogImplementation", "hive")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
)

# getOrCreate() hands back an already-active session if there is one.
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/09 14:48:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/09 14:48:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# List the databases (namespaces) visible through the configured catalog.
list_databases_query = 'show databases'
df_load = spark.sql(list_databases_query)
df_load.show()

+---------+
|namespace|
+---------+
|  default|
| openbeer|
+---------+



Попробуем сконвертировать CSV-файл breweries.csv в несколько Parquet-файлов.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# HDFS locations of the source CSV and of the Parquet output directory.
BREWERIES_CSV_PATH = "hdfs://namenode:9000/data/openbeer/breweries/breweries.csv"
BREWERIES_PARQUET_PATH = "hdfs://namenode:9000/data/openbeer/breweries_parquet"
# Number of partitions the data is split into before writing.
PARQUET_PARTITIONS = 4

# Create (or reuse) a SparkSession with Hive support and the remote metastore.
spark = (
    SparkSession.builder
    .appName("CSV_to_Parquet")
    .master("spark://spark-master:7077")
    .config("spark.sql.catalogImplementation", "hive")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

# Explicit schema for the five CSV columns; all of them are nullable.
schema = StructType([
    StructField("NUM", IntegerType(), True),
    StructField("NAME", StringType(), True),
    StructField("CITY", StringType(), True),
    StructField("STATE", StringType(), True),
    StructField("ID", IntegerType(), True),
])

# Read the CSV exactly once, with the explicit schema. Supplying the schema
# avoids the extra pass over the file that inferSchema=true would trigger.
# header=false means the first line is treated as data.
# NOTE(review): confirm the file really has no header row.
df = (
    spark.read
    .option("header", "false")
    .schema(schema)
    .csv(BREWERIES_CSV_PATH)
    # Split the data into several partitions so several part files are written.
    .repartition(PARQUET_PARTITIONS)
)

# Save as Parquet, replacing any previous output at that location.
df.write.mode("overwrite").parquet(BREWERIES_PARQUET_PATH)

print("✅ CSV сконвертирован в Parquet и сохранён в HDFS!")


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/09 14:55:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/09 14:55:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/10/09 14:56:10 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/10/09 14:56:25 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/10/09 14:56:40 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/10/09 14:56:55 WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your clu

✅ CSV сконвертирован в Parquet и сохранён в HDFS!


                                                                                

Теперь создадим внешнюю таблицу в Hive поверх сохранённых Parquet-файлов.

In [15]:
# Register the Parquet directory as an external Hive table.
# The table name is qualified with the `openbeer` database so the statement
# does not depend on which database happens to be current in the session
# (an unqualified name would be created in the current database instead).
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS openbeer.breweries_parquet(
    NUM INT,
    NAME STRING,
    CITY STRING,
    STATE STRING,
    ID INT
)
STORED AS PARQUET
LOCATION 'hdfs://namenode:9000/data/openbeer/breweries_parquet'
""")

DataFrame[]

In [16]:
# Check that the table has appeared. The database is named explicitly so the
# listing does not depend on the session's current database.
spark.sql("SHOW TABLES IN openbeer").show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
| openbeer|        breweries|      false|
| openbeer|breweries_parquet|      false|
+---------+-----------------+-----------+



In [18]:
# Sanity-check the data: query the table and print the first rows.
breweries_preview = spark.sql("SELECT * FROM openbeer.breweries_parquet")
breweries_preview.show()

+---+--------------------+------------------+-----+---+
|NUM|                NAME|              CITY|STATE| ID|
+---+--------------------+------------------+-----+---+
| 77|    Blue Owl Brewing|            Austin|   TX| 77|
|515|Crabtree Brewing ...|           Greeley|   CO|515|
|222|Carton Brewing Co...|Atlantic Highlands|   NJ|222|
|512|    Cottrell Brewing|         Pawcatuck|   CT|512|
|332|La Cumbre Brewing...|       Albuquerque|   NM|332|
|425|           Ciderboys|     Stevens Point|   WI|425|
|483|Bale Breaker Brew...|            Yakima|   WA|483|
|291|Tommyknocker Brewery|     Idaho Springs|   CO|291|
|350|Central Coast Bre...|   San Luis Obispo|   CA|350|
|334|The Traveler Beer...|        Burlington|   VT|334|
|327|Half Acre Beer Co...|           Chicago|   IL|327|
|299|Matt Brewing Company|             Utica|   NY|299|
|311|Dirty Bucket Brew...|       Woodinville|   WA|311|
|479|     Heavy Seas Beer|        Halethorpe|   MD|479|
|528|Asheville Brewing...|         Asheville|   