In [11]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from delta.tables import DeltaTable
import os
import boto3

def define_conf(conf:SparkConf, notebook_name, notebook_namespace, s3_access_key, s3_secret_key, s3_endpoint):
    
    # Spark driver, executor 설정
    conf.set("spark.submit.deployMode", "client")
    conf.set("spark.executor.instances", "1")
    conf.set("spark.executor.memory", "1G")
    conf.set("spark.driver.memory", "1G")
    conf.set("spark.executor.cores", "1")
    conf.set("spark.kubernetes.namespace", notebook_namespace)
    conf.set("spark.kubernetes.container.image", "paasup/spark:3.5.2-java17-python3.11-2")
    conf.set("spark.kubernetes.authenticate.driver.serviceAccountName", "default-editor")
    conf.set("spark.kubernetes.driver.pod.name", os.environ["HOSTNAME"])
    conf.set("spark.driver.bindAddress", "0.0.0.0")
    conf.set("spark.driver.host", notebook_name+ "-headless." + notebook_namespace + ".svc.cluster.local")
    conf.set("spark.driver.port", "51810")        
    conf.set("spark.broadcast.port", "51811")     
    conf.set("spark.blockManager.port", "51812")

    # s3, deltalake 사용 시 필요한 jar 패키지 설정
    jar_list = ",".join([
    "org.apache.hadoop:hadoop-common:3.3.4",
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk:1.11.655", 
    "io.delta:delta-spark_2.12:3.3.1"
    ])
    conf.set("spark.jars.packages", jar_list)  

    # s3 세팅
    conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    conf.set('spark.hadoop.fs.s3a.path.style.access', 'true')
    conf.set('spark.hadoop.fs.s3a.connection.ssl.enabled', 'true')
    conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    conf.set('spark.hadoop.fs.s3a.access.key', s3_access_key)
    conf.set('spark.hadoop.fs.s3a.secret.key', s3_secret_key)
    conf.set('spark.hadoop.fs.s3a.endpoint', s3_endpoint)

    ### ssl 검증 비활성화
    conf.set("spark.driver.extraJavaOptions", "-Dcom.amazonaws.sdk.disableCertChecking=true")
    conf.set("spark.executor.extraJavaOptions", "-Dcom.amazonaws.sdk.disableCertChecking=true")

    # deltalake 세팅
    conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")    

In [12]:
# spark session 설정 : s3 access용 param 
notebook_name      = "test"
notebook_namespace = "demo01-kf"

s3_endpoint        = "https://172.16.50.29:9000"
s3_access_key      = "3JHV0NGHJP378DJ4CXJ484GFFTIQQ2CW5CQFWJBM64K0QN2LSTIKKBJP1"
s3_secret_key      = "Z74G5VRH0SPPNUXY22K1MWPCCYS2PVZWHDWQ57L9YXO67DDXUE663DCTWDMTCA"

conf = SparkConf()
define_conf(
    conf=conf,
    notebook_name      = notebook_name,
    notebook_namespace = notebook_namespace,
    s3_access_key      = s3_access_key, 
    s3_secret_key      = s3_secret_key,
    s3_endpoint        = s3_endpoint
)

In [13]:
# spark session build 
appname = 'test_select_delta'
spark = SparkSession.builder.config(conf=conf)\
    .appName(appname)\
    .master("k8s://https://kubernetes.default.svc.cluster.local:443")\
    .getOrCreate()

In [24]:
# deltalake read

delta_sql = f"""
                        select  `사용일자` use_date
                           ,`노선명` line_no
                           ,`역명`  station_name
                           ,cast(`승차총승객수` as int) pass_in
                           ,cast(`하차총승객수` as int) pass_out
                           ,`등록일자` reg_date
                        from delta.`s3a://paasup/deltalake/subway_passengers`
                        where `사용일자` like '202504'||'%'
                    """    

#delta_sql = " select * from delta.`s3a://paasup/deltalake/subway_passengers` "
print ( delta_sql) 
result_df = spark.sql(delta_sql)

print(result_df)
#result_df.show(truncate=False)

# # temp view 생성 
# result_df.createOrReplaceTempView('subway_passengers')
# # temp view query 
# query = ' select * from subway_passengers '
# spark.sql(query).show(10)




                        select  `사용일자` use_date
                           ,`노선명` line_no
                           ,`역명`  station_name
                           ,cast(`승차총승객수` as int) pass_in
                           ,cast(`하차총승객수` as int) pass_out
                           ,`등록일자` reg_date
                        from delta.`s3a://paasup/deltalake/subway_passengers`
                        where `사용일자` like '202504'||'%'
                    


25/05/28 05:06:00 WARN TaskSetManager: Lost task 0.0 in stage 6.0 (TID 48) (10.42.9.199 executor 1): java.lang.NoSuchMethodError: 'java.lang.String com.amazonaws.util.AwsHostNameUtils.parseRegionFromInternalConfig(java.lang.String)'
	at com.amazonaws.regions.EndpointToRegion.guessRegionOrRegionNameForEndpoint(EndpointToRegion.java:86)
	at com.amazonaws.regions.EndpointToRegion.guessRegionNameForEndpoint(EndpointToRegion.java:41)
	at com.amazonaws.services.s3.internal.auth.S3SignerProvider.getSigner(S3SignerProvider.java:62)
	at com.amazonaws.http.ExecutionContext.getSigner(ExecutionContext.java:135)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor$ExecOneRequestParams.newSigner(AmazonHttpClient.java:1846)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1255)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1113)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpCl

Py4JJavaError: An error occurred while calling o250.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 6.0 failed 4 times, most recent failure: Lost task 0.3 in stage 6.0 (TID 54) (10.42.9.199 executor 1): java.lang.NoSuchMethodError: 'java.lang.String com.amazonaws.util.AwsHostNameUtils.parseRegionFromInternalConfig(java.lang.String)'
	at com.amazonaws.regions.EndpointToRegion.guessRegionOrRegionNameForEndpoint(EndpointToRegion.java:86)
	at com.amazonaws.regions.EndpointToRegion.guessRegionNameForEndpoint(EndpointToRegion.java:41)
	at com.amazonaws.services.s3.internal.auth.S3SignerProvider.getSigner(S3SignerProvider.java:62)
	at com.amazonaws.http.ExecutionContext.getSigner(ExecutionContext.java:135)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor$ExecOneRequestParams.newSigner(AmazonHttpClient.java:1846)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1255)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1113)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:770)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:744)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:726)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:686)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:668)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:532)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:512)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403)
	at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1372)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getObjectMetadata$10(S3AFileSystem.java:2545)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:377)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2533)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2513)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3776)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.extractOrFetchSimpleFileStatus(S3AFileSystem.java:5401)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.open(S3AFileSystem.java:1465)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$null$44(S3AFileSystem.java:5479)
	at org.apache.hadoop.util.LambdaUtils.eval(LambdaUtils.java:52)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$openFileWithOptions$45(S3AFileSystem.java:5478)
	at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.base/java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.lang.NoSuchMethodError: 'java.lang.String com.amazonaws.util.AwsHostNameUtils.parseRegionFromInternalConfig(java.lang.String)'
	at com.amazonaws.regions.EndpointToRegion.guessRegionOrRegionNameForEndpoint(EndpointToRegion.java:86)
	at com.amazonaws.regions.EndpointToRegion.guessRegionNameForEndpoint(EndpointToRegion.java:41)
	at com.amazonaws.services.s3.internal.auth.S3SignerProvider.getSigner(S3SignerProvider.java:62)
	at com.amazonaws.http.ExecutionContext.getSigner(ExecutionContext.java:135)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor$ExecOneRequestParams.newSigner(AmazonHttpClient.java:1846)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1255)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1113)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:770)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:744)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:726)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:686)
	at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:668)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:532)
	at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:512)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456)
	at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403)
	at com.amazonaws.services.s3.AmazonS3Client.getObjectMetadata(AmazonS3Client.java:1372)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$getObjectMetadata$10(S3AFileSystem.java:2545)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414)
	at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:377)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2533)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.getObjectMetadata(S3AFileSystem.java:2513)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3776)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.extractOrFetchSimpleFileStatus(S3AFileSystem.java:5401)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.open(S3AFileSystem.java:1465)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$null$44(S3AFileSystem.java:5479)
	at org.apache.hadoop.util.LambdaUtils.eval(LambdaUtils.java:52)
	at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$openFileWithOptions$45(S3AFileSystem.java:5478)
	at java.base/java.util.concurrent.FutureTask.run(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.base/java.lang.Thread.run(Unknown Source)


In [None]:
# #source file 삭제 (confinue...)
# s3 = boto3.resource('s3')
# bucket = s3.Bucket('dataload')
# s3.Object('dataload', 'CARD_SUBWAY_MONTH_202504.csv').delete()


In [26]:
# load data수 query 
query = ' select count(*) from subway_passengers '
spark.sql(query).show()

+--------+
|count(1)|
+--------+
|   18515|
+--------+



In [27]:
# spark session 종료 
spark.stop()

25/05/13 06:24:33 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
