In [1]:
from pyspark.sql import SparkSession
import os

In [2]:
# Switch: True builds a local 4-thread session, False targets the Kubernetes master.
local = False

if local:
    session_builder = SparkSession.builder.master("local[4]").appName("aida_poc_etl")
else:
    session_builder = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("aida_poc_etl")
    )

    # Options whose values come from environment variables; a missing variable
    # raises KeyError at the point where it is looked up.
    env_backed_options = {
        "spark.kubernetes.container.image": "IMAGE_NAME",
        "spark.kubernetes.authenticate.driver.serviceAccountName": "KUBERNETES_SERVICE_ACCOUNT",
        "spark.kubernetes.namespace": "KUBERNETES_NAMESPACE",
    }
    for option_key, env_name in env_backed_options.items():
        session_builder = session_builder.config(option_key, os.environ[env_name])

    # Fixed sizing plus the extra package resolved when the session starts.
    fixed_options = {
        "spark.executor.instances": "10",
        "spark.executor.memory": "16g",
        "spark.driver.memory": "32g",
        "spark.jars.packages": "org.postgresql:postgresql:42.2.24",
    }
    for option_key, option_value in fixed_options.items():
        session_builder = session_builder.config(option_key, option_value)

    # Hive support is only enabled for the cluster session.
    session_builder = session_builder.enableHiveSupport()

spark = session_builder.getOrCreate()

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e6808f69-1307-411c-858a-c8ca6c63b0d1;1.0
	confs: [default]
	found org.postgresql#postgresql;42.2.24 in central
	found org.checkerframework#checker-qual;3.5.0 in central
:: resolution report :: resolve 154ms :: artifacts dl 6ms
	:: modules in use:
	org.checkerframework#checker-qual;3.5.0 from central in [default]
	org.postgresql#postgresql;42.2.24 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-------------------------------------------

In [14]:
def set_log_level(spark_session, log_level: str):
    """Set the log4j level of the "org" and "akka" loggers in the session's JVM.

    Args:
        spark_session: Object exposing ``sparkContext._jvm`` (the Spark session
            in this notebook).
        log_level: Exactly one of "INFO", "WARN" or "ERROR" (case-sensitive).

    Raises:
        ValueError: If ``log_level`` is not one of the three accepted names.
    """
    log4j = spark_session.sparkContext._jvm.org.apache.log4j

    # Guard clause: reject anything outside the supported names up front.
    if log_level not in ("INFO", "WARN", "ERROR"):
        raise ValueError("The log_level must be INFO, WARN or ERROR")

    # The accepted names are also the attribute names on log4j's Level class.
    target_level = getattr(log4j.Level, log_level)
    for logger_name in ("org", "akka"):
        log4j.LogManager.getLogger(logger_name).setLevel(target_level)

In [18]:
# Set the "org" and "akka" JVM loggers of the active session to INFO.
set_log_level(spark_session=spark, log_level="INFO")

In [3]:


# S3 prefix and dataset name; data_path is the two joined with a single "/".
work_dir = "s3a://projet-poc-aida/rp"
parquet_file_name = "individus_snappy_parquet"
data_path = "/".join((work_dir, parquet_file_name))

# Step 1: Prepare source dataframe

Use the Spark session to read the Parquet file and return a DataFrame.

In [19]:
# Read the Parquet dataset located at data_path into a Spark DataFrame.
df_parquet=spark.read.parquet(data_path)

2022-09-04 10:30:29,421 INFO datasources.InMemoryFileIndex: It took 49 ms to list leaf files for 1 paths.
2022-09-04 10:30:29,479 INFO spark.SparkContext: Starting job: parquet at NativeMethodAccessorImpl.java:0
2022-09-04 10:30:29,480 INFO scheduler.DAGScheduler: Got job 5 (parquet at NativeMethodAccessorImpl.java:0) with 1 output partitions
2022-09-04 10:30:29,480 INFO scheduler.DAGScheduler: Final stage: ResultStage 5 (parquet at NativeMethodAccessorImpl.java:0)
2022-09-04 10:30:29,480 INFO scheduler.DAGScheduler: Parents of final stage: List()
2022-09-04 10:30:29,481 INFO scheduler.DAGScheduler: Missing parents: List()
2022-09-04 10:30:29,482 INFO scheduler.DAGScheduler: Submitting ResultStage 5 (MapPartitionsRDD[15] at parquet at NativeMethodAccessorImpl.java:0), which has no missing parents
2022-09-04 10:30:29,503 INFO memory.MemoryStore: Block broadcast_7 stored as values in memory (estimated size 157.1 KiB, free 19.0 GiB)
2022-09-04 10:30:29,506 INFO memory.MemoryStore: Block b

In [17]:
# Print the first five rows as a quick check of the loaded data.
df_parquet.show(n=5)

+----------------+---------------------+-----------------+--------------+-------------------+---------------+------------------+-------------+--------------+------------------+----+----------+-------+----------------+-------+---+---------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
|region_residence|departement_residence|commune_residence|region_travail|departement_travail|commune_travail|commune_anterieure|commune_etude|pays_naissance|             poids|sexe|statut_pro|densite|recherche_emplo

                                                                                

# Step 2: Create a table in the Hive metastore

Use the Spark DataFrame to create a Hive table in the Hive metastore, so that we can reuse it later.


In [6]:
# Name of the Hive table used in the SQL statements of this notebook.
table_name = "individus_test"

In [7]:
# Build the column list of the DDL from the DataFrame schema.
# df_parquet.dtypes yields (column name, type string) pairs. Each name is
# wrapped in backticks (embedded backticks doubled) so that names containing
# spaces, special characters or SQL reserved words still produce valid DDL.
schema_str = ', '.join(
    f"`{col_name.replace('`', '``')}` {col_type}"
    for col_name, col_type in df_parquet.dtypes
)

# Register the Parquet files at data_path as an external table named
# table_name; IF NOT EXISTS makes re-running this cell a no-op.
spark.sql(f"""
CREATE EXTERNAL TABLE IF NOT EXISTS {table_name}
({schema_str})
STORED as parquet LOCATION '{data_path}'
""")


2022-09-04 10:12:23,703 INFO conf.HiveConf: Found configuration file file:/opt/hive/conf/hive-site.xml
2022-09-04 10:12:23,732 INFO hive.HiveUtils: Initializing HiveMetastoreConnection version 2.3.9 using Spark classes.
2022-09-04 10:12:23,962 INFO client.HiveClientImpl: Warehouse location for Hive client (version 2.3.9) is file:/home/jovyan/work/Poc_Aida/notebook/spark-warehouse
2022-09-04 10:12:24,015 INFO hive.metastore: Trying to connect to metastore with URI thrift://hive-metastore:9083
2022-09-04 10:12:24,034 INFO hive.metastore: Opened a connection to metastore, current connections: 1
2022-09-04 10:12:24,092 INFO hive.metastore: Connected to metastore.
2022-09-04 10:12:24,454 INFO sqlstd.SQLStdHiveAccessController: Created SQLStdHiveAccessController for session context : HiveAuthzSessionContext [sessionString=5efb23be-b125-4a4b-806a-c66fd63fb1c6, clientType=HIVECLI]
2022-09-04 10:12:24,456 WARN session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.auth

DataFrame[]

In [8]:
# List the tables visible to this session to confirm the new table is there.
tables_df = spark.sql('show tables;')
tables_df.show()

2022-09-04 10:12:29,515 INFO codegen.CodeGenerator: Code generated in 31.947752 ms
2022-09-04 10:12:29,572 INFO codegen.CodeGenerator: Code generated in 8.157305 ms
2022-09-04 10:12:29,592 INFO codegen.CodeGenerator: Code generated in 8.772728 ms
2022-09-04 10:12:29,604 INFO spark.SparkContext: Starting job: showString at NativeMethodAccessorImpl.java:0
2022-09-04 10:12:29,605 INFO scheduler.DAGScheduler: Got job 2 (showString at NativeMethodAccessorImpl.java:0) with 1 output partitions
2022-09-04 10:12:29,606 INFO scheduler.DAGScheduler: Final stage: ResultStage 2 (showString at NativeMethodAccessorImpl.java:0)
2022-09-04 10:12:29,606 INFO scheduler.DAGScheduler: Parents of final stage: List()
2022-09-04 10:12:29,606 INFO scheduler.DAGScheduler: Missing parents: List()
2022-09-04 10:12:29,607 INFO scheduler.DAGScheduler: Submitting ResultStage 2 (MapPartitionsRDD[8] at showString at NativeMethodAccessorImpl.java:0), which has no missing parents
2022-09-04 10:12:29,612 INFO memory.Memo

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|  default|     individus|      false|
|  default|individus_test|      false|
+---------+--------------+-----------+



2022-09-04 10:12:29,736 INFO storage.BlockManagerInfo: Added broadcast_4_piece0 in memory on 10.233.112.158:38863 (size: 3.8 KiB, free: 9.4 GiB)
2022-09-04 10:12:29,750 INFO scheduler.TaskSetManager: Finished task 0.0 in stage 3.0 (TID 3) in 36 ms on 10.233.112.158 (executor 9) (1/1)
2022-09-04 10:12:29,750 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 3.0, whose tasks have all completed, from pool 
2022-09-04 10:12:29,751 INFO scheduler.DAGScheduler: ResultStage 3 (showString at NativeMethodAccessorImpl.java:0) finished in 0.044 s
2022-09-04 10:12:29,751 INFO scheduler.DAGScheduler: Job 3 is finished. Cancelling potential speculative or zombie tasks for this job
2022-09-04 10:12:29,751 INFO scheduler.TaskSchedulerImpl: Killing all running tasks in stage 3: Stage finished
2022-09-04 10:12:29,751 INFO scheduler.DAGScheduler: Job 3 finished: showString at NativeMethodAccessorImpl.java:0, took 0.048112 s
2022-09-04 10:12:29,765 INFO codegen.CodeGenerator: Code generated in 11.346359 m

Your Hive table has now been created. In the background, if you enabled the listener, the metadata of this Hive table is uploaded to our [data catalog](https://atlas.lab.sspcloud.fr/index.html#!/search), so you can easily find all your Hive tables even if you no longer have the notebook.

You can try to use the search engine of our [data catalog](https://atlas.lab.sspcloud.fr/index.html#!/search) to find your table. 

In [9]:
# Query the table through SQL and print five rows.
preview_query = f"""SELECT * FROM {table_name} limit 5"""
spark.sql(preview_query).show()



+----------------+---------------------+-----------------+--------------+-------------------+---------------+------------------+-------------+--------------+------------------+----+----------+-------+----------------+-------+---+---------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
|region_residence|departement_residence|commune_residence|region_travail|departement_travail|commune_travail|commune_anterieure|commune_etude|pays_naissance|             poids|sexe|statut_pro|densite|recherche_emplo

                                                                                

# Step 3: Drop table

You can delete your table if you don't need it anymore. You will notice that the metadata of the deleted table is `removed` from the data catalog too.

In [10]:
# Drop the table (no error if it is already gone), then list the remaining tables.
drop_statement = f"""drop table if exists {table_name}"""
spark.sql(drop_statement).show()
spark.sql('show tables;').show()

++
||
++
++

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|individus|      false|
+---------+---------+-----------+

