In [1]:
from pyspark.sql import SparkSession
import os

In [2]:
# Switch between an in-process Spark session (True) and a session whose
# executors are scheduled on Kubernetes (False).
local = False

if local:
    # Local mode: master "local[4]", no extra configuration.
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("aida_poc_etl")
        .getOrCreate()
    )
else:
    # Cluster mode: the container image, service account and namespace are
    # read from environment variables; a missing variable raises KeyError.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("aida_poc_etl")
        .config("spark.kubernetes.container.image", os.environ["IMAGE_NAME"])
        .config(
            "spark.kubernetes.authenticate.driver.serviceAccountName",
            os.environ['KUBERNETES_SERVICE_ACCOUNT'],
        )
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        # Resource sizing for the job.
        .config("spark.executor.instances", "10")
        .config("spark.executor.memory", "16g")
        .config("spark.driver.memory", "32g")
        # Resolve the PostgreSQL JDBC driver package at session start-up.
        .config('spark.jars.packages', 'org.postgresql:postgresql:42.2.24')
        .getOrCreate()
    )



:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6b17ba73-85b3-4c31-95b4-3c3d1b8f69ee;1.0
	confs: [default]
	found org.postgresql#postgresql;42.2.24 in central
	found org.checkerframework#checker-qual;3.5.0 in central
:: resolution report :: resolve 170ms :: artifacts dl 6ms
	:: modules in use:
	org.checkerframework#checker-qual;3.5.0 from central in [default]
	org.postgresql#postgresql;42.2.24 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-------------------------------------------

In [5]:
# Location of the source CSV in object storage.
work_dir = "s3a://projet-poc-aida/rp"
file_name = "individus.csv"
file_path = f"{work_dir}/{file_name}"

# Read the semicolon-delimited CSV: the first row supplies the column names
# and Spark infers the column types from the data.
df = spark.read.options(
    header=True,
    inferSchema=True,
    delimiter=';',
).csv(path=file_path)

                                                                                

In [7]:
# Preview the first five rows without truncating cell values.
# NOTE(review): df.cache() is left commented out — presumably so that the
# timings measured on this DataFrame include reading the CSV; confirm.
# df.cache()
df.show(5,truncate=False)

+----------------+---------------------+-----------------+--------------+-------------------+---------------+------------------+-------------+--------------+------------------+----+----------+-------+----------------+-------+---+---------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+
|region_residence|departement_residence|commune_residence|region_travail|departement_travail|commune_travail|commune_anterieure|commune_etude|pays_naissance|poids             |sexe|statut_pro|densite|recherche_emplo

In [8]:
# Shell escape: report the total size of the source CSV object
# (`mc` is presumably the MinIO client — confirm).
! mc ls --summarize s3/projet-poc-aida/rp/individus.csv | grep "Total Size"

Total Size: 14 GiB


In [9]:
# Count the rows of the CSV-backed DataFrame.
df.count()

                                                                                

50000000

In [10]:
# Show the column names and the types Spark inferred from the CSV
# (inferSchema was enabled when the file was read).
df.printSchema()

root
 |-- region_residence: integer (nullable = true)
 |-- departement_residence: string (nullable = true)
 |-- commune_residence: string (nullable = true)
 |-- region_travail: integer (nullable = true)
 |-- departement_travail: string (nullable = true)
 |-- commune_travail: string (nullable = true)
 |-- commune_anterieure: string (nullable = true)
 |-- commune_etude: string (nullable = true)
 |-- pays_naissance: integer (nullable = true)
 |-- poids: double (nullable = true)
 |-- sexe: integer (nullable = true)
 |-- statut_pro: string (nullable = true)
 |-- densite: integer (nullable = true)
 |-- recherche_emploi: integer (nullable = true)
 |-- diplome: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- variable00: long (nullable = true)
 |-- variable01: integer (nullable = true)
 |-- variable02: integer (nullable = true)
 |-- variable03: integer (nullable = true)
 |-- variable04: double (nullable = true)
 |-- variable05: double (nullable = true)
 |-- variable06: intege

In [11]:
# Number of partitions of the RDD that backs the CSV DataFrame.
partition_number=df.rdd.getNumPartitions()
print(f"Data partition number: {partition_number}")

Data partition number: 109


In [12]:
# Destination of the non-partitioned Parquet copy of the CSV data.
output_file = "individus_snappy_parquet"
output_path = f"{work_dir}/{output_file}"

# Replace any previous output at this path. No codec is set explicitly, so the
# session's default Parquet compression applies (presumably snappy, as the
# directory name suggests — confirm).
df.write.parquet(output_path, mode="overwrite")

                                                                                

In [11]:
# Shell escape: report the total size of the Parquet output directory,
# for comparison with the size of the source CSV.
! mc ls --summarize s3/projet-poc-aida/rp/individus_snappy_parquet | grep "Total Size"

Total Size: 1.1 GiB


In [16]:
# Directory name of the region-partitioned Parquet output.
partition_output_file="individus_partition_region"
# Name of the region-of-residence column in the source data.
col_name="region_residence"

# Full object-storage path of the partitioned output.
partition_output_path=f"{work_dir}/{partition_output_file}"

In [14]:


# `col` and `StringType` are imported here so this cell runs on a fresh kernel;
# the notebook's other import of these names only appears in a later cell.
from pyspark.sql.functions import col
from pyspark.sql.types import StringType

# Add a string copy of the residence-region column (named by `col_name`).
df1 = df.withColumn("region_str", col(col_name).cast(StringType()))

In [17]:
# Write df1 as Parquet partitioned by region_str, replacing any previous
# output at this path.
df1.write.parquet(
    partition_output_path,
    mode="overwrite",
    partitionBy="region_str",
)

                                                                                

In [17]:
from pyspark.sql.types import StructType, IntegerType,StringType
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col

# Query 1

select sum(poids), sexe, densite, statut_pro from TABLE where region_residence='44' and region_travail!='44' and statut_pro!='Z' group by sexe, densite, statut_pro

In [18]:
import time


def run_query_1(df):
    """Run benchmark query 1 on ``df`` and print its results and elapsed time.

    The query keeps the rows where ``region_residence`` equals 44,
    ``region_travail`` differs from 44 and ``statut_pro`` differs from "Z",
    then sums ``poids`` for each (``sexe``, ``densite``, ``statut_pro``) group.

    Args:
        df: Spark DataFrame that provides the columns ``region_residence``,
            ``region_travail``, ``poids``, ``sexe``, ``densite`` and
            ``statut_pro``.

    Returns:
        None. A preview of the filtered rows, the aggregated result and the
        elapsed time in seconds are printed.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()

    row_filter = (
        (col("region_residence") == 44)
        & (col("region_travail") != 44)
        & (col("statut_pro") != "Z")
    )
    filtered = df.filter(row_filter).select(
        "region_residence", "region_travail", "poids", "sexe", "densite", "statut_pro"
    )
    print("Data frame after filter")
    filtered.show()

    print("Final result data frame")
    # `sum` is pyspark.sql.functions.sum as imported by the notebook, not the builtin.
    weight_by_group = filtered.groupBy("sexe", "densite", "statut_pro").agg(sum(col("poids")))
    weight_by_group.show()

    # The measured span covers both show() calls above.
    elapsed = time.perf_counter() - start
    print(f"Total time spent: {elapsed}")
    

In [19]:
# Baseline: run query 1 on the DataFrame read from the CSV file.
run_query_1(df)

Data frame after filter
+----------------+--------------+------------------+----+-------+----------+
|region_residence|region_travail|             poids|sexe|densite|statut_pro|
+----------------+--------------+------------------+----+-------+----------+
|              44|            52|0.9173226192328938|   2|      4|         1|
|              44|            27|1.0844752868227385|   2|      4|         1|
|              44|            84|1.4552633116136917|   1|      4|         1|
|              44|            32|1.2880844787953543|   1|      3|         2|
|              44|            76|0.8567941093656907|   1|      4|         1|
|              44|            75| 1.360188435113091|   1|      4|         1|
|              44|            76|0.8156473498675924|   2|      2|         1|
|              44|            75|1.2422037759139486|   2|      4|         1|
|              44|            76|0.5489129674794979|   2|      4|         2|
|              44|            84| 1.420871377293334|



+----+-------+----------+------------------+
|sexe|densite|statut_pro|        sum(poids)|
+----+-------+----------+------------------+
|   1|      4|         1| 84860.02165580635|
|   1|      3|         1| 81647.73008880654|
|   1|      3|         2|20318.627070021386|
|   2|      2|         2| 4594.674335848757|
|   2|      4|         2| 21197.94139432258|
|   2|      3|         1|  81280.1828071123|
|   2|      4|         1| 85804.86044169618|
|   1|      1|         1|2422.4620618296367|
|   2|      3|         2|  20696.8599381208|
|   1|      4|         2| 21014.73402813378|
|   1|      2|         1|18598.935870467652|
|   2|      1|         1| 2403.512048985781|
|   2|      2|         1| 18421.38674213923|
|   1|      2|         2|4560.4938174609615|
|   1|      1|         2|  607.757923308453|
|   2|      1|         2| 592.3437398684349|
+----+-------+----------+------------------+

Total time spent: 36.96817231178284


                                                                                

## Run the same query on the Parquet datasets


In [20]:
# Non-partitioned Parquet dataset (same directory name as the Parquet copy
# written from the CSV earlier in the notebook).
parquet_file_name="individus_snappy_parquet"
# NOTE(review): this name differs from "individus_partition_region", the
# partitioned output written earlier in the notebook — confirm which
# partitioned dataset is meant to be benchmarked.
partition_parquet_name="individus-region-residence.parquet"
parquet_path=f"{work_dir}/{parquet_file_name}"

partition_parquet_path=f"{work_dir}/{partition_parquet_name}"

In [21]:
# Load the non-partitioned Parquet dataset.
df_parquet=spark.read.parquet(parquet_path)

In [22]:
# Load the Parquet dataset stored at partition_parquet_path.
df_parquet_partition=spark.read.parquet(partition_parquet_path)

In [23]:
# Run query 1 on the non-partitioned Parquet data, for comparison with the CSV run.
run_query_1(df_parquet)

Data frame after filter


                                                                                

+----------------+--------------+------------------+----+-------+----------+
|region_residence|region_travail|             poids|sexe|densite|statut_pro|
+----------------+--------------+------------------+----+-------+----------+
|              44|            76|0.6964892028349067|   1|      3|         1|
|              44|            53|0.9099529312417702|   1|      3|         2|
|              44|            84|1.3965981643575496|   2|      4|         1|
|              44|            76|0.9166255577323749|   1|      2|         1|
|              44|            28|1.4835491154585285|   2|      3|         2|
|              44|            76|0.8553348903506489|   2|      4|         1|
|              44|            32|0.6249929984339587|   2|      4|         2|
|              44|            32|0.6787379373345154|   1|      2|         1|
|              44|            75|1.2816300236916485|   2|      3|         1|
|              44|            27|1.3839504158949147|   1|      4|         1|



+----+-------+----------+------------------+
|sexe|densite|statut_pro|        sum(poids)|
+----+-------+----------+------------------+
|   1|      4|         1| 84860.02165580637|
|   1|      3|         1| 81647.73008880642|
|   1|      3|         2| 20318.62707002139|
|   2|      2|         2| 4594.674335848756|
|   2|      4|         2|21197.941394322577|
|   2|      3|         1| 81280.18280711232|
|   2|      4|         1| 85804.86044169607|
|   1|      1|         1|2422.4620618296367|
|   2|      3|         2|20696.859938120797|
|   1|      4|         2|21014.734028133782|
|   1|      2|         1|18598.935870467667|
|   2|      1|         1|2403.5120489857813|
|   2|      2|         1|18421.386742139224|
|   1|      2|         2|  4560.49381746096|
|   1|      1|         2| 607.7579233084527|
|   2|      1|         2| 592.3437398684353|
+----+-------+----------+------------------+

Total time spent: 13.087724924087524


                                                                                

In [24]:
# Run query 1 on the Parquet data loaded from partition_parquet_path.
run_query_1(df_parquet_partition)

Data frame after filter


                                                                                

+----------------+--------------+------------------+----+-------+----------+
|region_residence|region_travail|             poids|sexe|densite|statut_pro|
+----------------+--------------+------------------+----+-------+----------+
|              44|            76|0.8238963726995286|   1|      4|         2|
|              44|            28| 1.319271725135387|   2|      2|         1|
|              44|            75|0.6567601781470883|   2|      3|         2|
|              44|            84| 1.253597183134104|   1|      4|         1|
|              44|            76|0.8731775260562563|   1|      4|         2|
|              44|            84| 0.536695960149986|   1|      3|         1|
|              44|            93| 1.396082818263767|   2|      1|         1|
|              44|            75|0.5569451529336501|   2|      3|         1|
|              44|            32|0.9938252776533797|   1|      4|         1|
|              44|            24|0.5336664630957909|   1|      4|         1|



+----+-------+----------+------------------+
|sexe|densite|statut_pro|        sum(poids)|
+----+-------+----------+------------------+
|   1|      4|         1| 84860.02165580633|
|   1|      3|         1| 81647.73008880646|
|   1|      3|         2|20318.627070021397|
|   2|      2|         2| 4594.674335848756|
|   2|      4|         2|21197.941394322577|
|   2|      3|         1| 81280.18280711232|
|   2|      4|         1| 85804.86044169606|
|   1|      1|         1|2422.4620618296367|
|   2|      3|         2|20696.859938120782|
|   1|      4|         2|21014.734028133775|
|   1|      2|         1|18598.935870467656|
|   2|      1|         1| 2403.512048985781|
|   2|      2|         1|18421.386742139228|
|   1|      2|         2|  4560.49381746096|
|   1|      1|         2| 607.7579233084527|
|   2|      1|         2| 592.3437398684353|
+----+-------+----------+------------------+

Total time spent: 12.353963613510132


                                                                                