In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
import os
import boto3

def define_conf(conf: SparkConf, notebook_name, notebook_namespace, s3_access_key, s3_secret_key, s3_endpoint):
    """Fill ``conf`` in place with the settings used by this notebook.

    The options are applied one by one through ``conf.set`` and cover, in
    order: driver/executor sizing and Kubernetes client-mode settings, the
    jar packages for S3 and Delta Lake, the S3A filesystem options, the JVM
    flag that disables AWS SDK certificate checking, and the Delta Lake SQL
    extension and catalog.

    Args:
        conf: Configuration object to populate; only its ``set`` method is used.
        notebook_name: Used to build ``spark.driver.host`` as
            ``<notebook_name>-headless.<notebook_namespace>.svc.cluster.local``.
        notebook_namespace: Value for ``spark.kubernetes.namespace``; also
            part of the driver host name.
        s3_access_key: Value for ``spark.hadoop.fs.s3a.access.key``.
        s3_secret_key: Value for ``spark.hadoop.fs.s3a.secret.key``.
        s3_endpoint: Value for ``spark.hadoop.fs.s3a.endpoint``.

    Returns:
        None. ``conf`` is modified in place.

    Raises:
        KeyError: If the ``HOSTNAME`` environment variable is not set.
    """
    # Jar packages required for S3 and Delta Lake, as one comma-separated string.
    packages = ",".join((
        "org.apache.hadoop:hadoop-common:3.3.4",
        "org.apache.hadoop:hadoop-aws:3.3.4",
        "com.amazonaws:aws-java-sdk:1.11.655",
        "io.delta:delta-spark_2.12:3.3.1",
    ))

    # JVM flag that turns off certificate checking in the AWS SDK.
    no_cert_check = "-Dcom.amazonaws.sdk.disableCertChecking=true"

    def _options():
        # Yielded lazily so each value is evaluated right before it is set.

        # Spark driver / executor settings
        yield "spark.submit.deployMode", "client"
        yield "spark.executor.instances", "1"
        yield "spark.executor.memory", "1G"
        yield "spark.driver.memory", "1G"
        yield "spark.executor.cores", "1"
        yield "spark.kubernetes.namespace", notebook_namespace
        yield "spark.kubernetes.container.image", "paasup/spark:3.5.2-java17-python3.11-2"
        yield "spark.kubernetes.authenticate.driver.serviceAccountName", "default-editor"
        yield "spark.kubernetes.driver.pod.name", os.environ["HOSTNAME"]
        yield "spark.driver.bindAddress", "0.0.0.0"
        yield "spark.driver.host", notebook_name + "-headless." + notebook_namespace + ".svc.cluster.local"
        yield "spark.driver.port", "51810"
        yield "spark.broadcast.port", "51811"
        yield "spark.blockManager.port", "51812"

        # Jar packages
        yield "spark.jars.packages", packages

        # S3 settings
        yield "spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"
        yield "spark.hadoop.fs.s3a.path.style.access", "true"
        yield "spark.hadoop.fs.s3a.connection.ssl.enabled", "true"
        yield "spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"
        yield "spark.hadoop.fs.s3a.access.key", s3_access_key
        yield "spark.hadoop.fs.s3a.secret.key", s3_secret_key
        yield "spark.hadoop.fs.s3a.endpoint", s3_endpoint

        # Disable SSL certificate verification on driver and executors
        yield "spark.driver.extraJavaOptions", no_cert_check
        yield "spark.executor.extraJavaOptions", no_cert_check

        # Delta Lake settings
        yield "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
        yield "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"

    for option, value in _options():
        conf.set(option, value)

ModuleNotFoundError: No module named 'pyspark'

In [2]:
# Spark session settings: parameters for S3 access
notebook_name      = "test"
notebook_namespace = "demo01-kf"

s3_endpoint        = "https://172.16.50.29:9000"
# Credentials are read from the environment so that no secret is stored in
# the notebook. A missing variable raises KeyError here rather than failing
# later with an authentication error.
s3_access_key      = os.environ["AWS_ACCESS_KEY_ID"]
s3_secret_key      = os.environ["AWS_SECRET_ACCESS_KEY"]

# define_conf fills `conf` in place.
conf = SparkConf()
define_conf(
    conf=conf,
    notebook_name      = notebook_name,
    notebook_namespace = notebook_namespace,
    s3_access_key      = s3_access_key, 
    s3_secret_key      = s3_secret_key,
    s3_endpoint        = s3_endpoint
)

In [None]:
# Build the Spark session from the prepared `conf`
appname = 'test_select_delta'
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName(appname)
    .master("k8s://https://kubernetes.default.svc.cluster.local:443")
    .getOrCreate()
)

In [25]:
# Read from Delta Lake

# Path-based query against the Delta table. The path must be wrapped in
# backticks after the `delta.` prefix, and the column names are the ones the
# stored table exposes (use_date, line_no, station_name, passengers_in,
# passengers_out, reg_date), as shown by df_read.show() below.
delta_sql = """
                        select  use_date
                           ,line_no
                           ,station_name
                           ,cast(passengers_in as int) pass_in
                           ,cast(passengers_out as int) pass_out
                           ,reg_date
                        from delta.`s3a://deltalake/subway_passengers`
                        where use_date like '202504%'
                    """
# Run the query defined in this cell, not a name left over from another cell.
delta_df = spark.sql(delta_sql)

# Same table read through the DataFrame API.
df_read = spark.read.format("delta").load('s3a://deltalake/subway_passengers')
df_read.show()




                                                                                

+--------+-------+-----------------------+-------------+--------------+--------+
|use_date|line_no|           station_name|passengers_in|passengers_out|reg_date|
+--------+-------+-----------------------+-------------+--------------+--------+
|20250401|  2호선|                   시청|        31730|         30459|20250404|
|20250401|  2호선|             을지로입구|        55089|         57583|20250404|
|20250401|  2호선|              을지로3가|        27974|         28001|20250404|
|20250401|  2호선|              을지로4가|        17026|         17012|20250404|
|20250401|  2호선|동대문역사문화공원(DDP)|        15742|         18349|20250404|
|20250401|  2호선|                   신당|        16285|         16750|20250404|
|20250401|  2호선|               상왕십리|        16398|         16250|20250404|
|20250401|  2호선|       왕십리(성동구청)|        17431|         15366|20250404|
|20250401|  2호선|                 한양대|        18819|         20960|20250404|
|20250401|  2호선|                   뚝섬|        28369|         30008|20250404|
|20250401

In [None]:
# # Delete the source file (to be continued...)
# s3 = boto3.resource('s3')
# bucket = s3.Bucket('dataload')
# s3.Object('dataload', 'CARD_SUBWAY_MONTH_202504.csv').delete()


In [26]:
# Count the loaded rows
# The Delta table is addressed by path so this cell does not rely on a
# `subway_passengers` table or view, which no cell in this notebook registers.
# NOTE(review): assumes the count is meant for the Delta table at
# s3a://deltalake/subway_passengers - confirm.
query = ' select count(*) from delta.`s3a://deltalake/subway_passengers` '
spark.sql(query).show()

+--------+
|count(1)|
+--------+
|   18515|
+--------+



In [27]:
# Stop the Spark session
spark.stop()

25/05/13 06:24:33 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
