In [5]:
from pyspark.sql import SparkSession,DataFrame

import os
from pyspark.sql.functions import col, sum

# Pivot

Given the Data Set DS_1 (Line 7150):

```text
+----+----+----+----+
|Id_1|Id_2|Me_1|At_1|
+----+----+----+----+
|   1|   A|   5|   E|
|   1|   B|   2|   F|
|   1|   C|   7|   F|
|   2|   A|   3|   E|
|   2|   B|   4|   E|
|   2|   C|   9|   F|
+----+----+----+----+
```

Example 1: `DS_r := DS_1 [ pivot Id_2, Me_1 ]` results in:

```text
+----+---+---+---+
|Id_1|  A|  B|  C|
+----+---+---+---+
|   1|  5|  2|  7|
|   2|  3|  4|  9|
+----+---+---+---+
```

In [2]:
# Switch between a Spark session on this machine and one on Kubernetes.
local = True

session_builder = SparkSession.builder
if local:
    # Local mode: "local[4]" runs Spark in-process with 4 worker threads.
    session_builder = session_builder.master("local[4]").appName("VTLPIVOT")
else:
    # Cluster mode: the service account and namespace are read from the
    # environment, so a missing variable raises KeyError before any session
    # is created.
    session_builder = (
        session_builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("VTLPIVOT")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "4g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
    )

# Reuses an already running session when there is one, otherwise starts it.
spark = session_builder.getOrCreate()

23/05/11 15:46:10 WARN Utils: Your hostname, pengfei-Virtual-Machine resolves to a loopback address: 127.0.1.1; using 10.50.2.80 instead (on interface eth0)
23/05/11 15:46:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/11 15:46:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/11 15:46:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Relative location of the sample data set used for the pivot example.
root_path = "../../data"
data_path = f"{root_path}/pivot_ds1.csv"

# The first CSV line holds the column names; column types are inferred
# from the data instead of defaulting to strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(data_path)
df.show()

+----+----+----+----+
|Id_1|Id_2|Me_1|At_1|
+----+----+----+----+
|   1|   A|   5|   E|
|   1|   B|   2|   F|
|   1|   C|   7|   F|
|   2|   A|   3|   E|
|   2|   B|   4|   E|
|   2|   C|   9|   F|
+----+----+----+----+



In [7]:
# Print the schema of df (column names, types and nullability) as a tree.
df.printSchema()

root
 |-- Id_1: integer (nullable = true)
 |-- Id_2: string (nullable = true)
 |-- Me_1: integer (nullable = true)
 |-- At_1: string (nullable = true)



In [11]:
# VTL expression being reproduced: DS_r := DS_1 [ pivot Id_2, Me_1 ]
identifier = "Id_2"  # its distinct values become the new column names
measure = "Me_1"     # its values fill the new columns

# Rows sharing the same Id_1 collapse into one output row; the measure is
# summed per (Id_1, Id_2) combination.
rows_by_id = df.groupBy("Id_1")
df_resu = rows_by_id.pivot(identifier).sum(measure)

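> Compared with the VTL expression `pivot Id_2, Me_1`, the Spark version needs one more argument to make it work: the column to group by (here `Id_1`).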

In [12]:
# Display the pivoted data frame.
df_resu.show()

+----+---+---+---+
|Id_1|  A|  B|  C|
+----+---+---+---+
|   1|  5|  2|  7|
|   2|  3|  4|  9|
+----+---+---+---+



In [8]:
# Sample sales records as (product, amount, country) tuples.
# Note that the ("Orange", 2000, "USA") record is listed twice.
data = [
    ("Banana", 1000, "USA"),
    ("Carrots", 1500, "USA"),
    ("Beans", 1600, "USA"),
    ("Orange", 2000, "USA"),
    ("Orange", 2000, "USA"),
    ("Banana", 400, "China"),
    ("Carrots", 1200, "China"),
    ("Beans", 1500, "China"),
    ("Orange", 4000, "China"),
    ("Banana", 2000, "Canada"),
    ("Carrots", 2000, "Canada"),
    ("Beans", 2000, "Mexico"),
]

# Column names, in the same order as the tuple fields above.
columns = ["Product", "Amount", "Country"]
df_test = spark.createDataFrame(data, columns)
df_test.printSchema()
df_test.show(truncate=False)

root
 |-- Product: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- Country: string (nullable = true)



                                                                                

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
|Banana |1000  |USA    |
|Carrots|1500  |USA    |
|Beans  |1600  |USA    |
|Orange |2000  |USA    |
|Orange |2000  |USA    |
|Banana |400   |China  |
|Carrots|1200  |China  |
|Beans  |1500  |China  |
|Orange |4000  |China  |
|Banana |2000  |Canada |
|Carrots|2000  |Canada |
|Beans  |2000  |Mexico |
+-------+------+-------+



In [9]:
# One row per product and one column per country; each cell holds the sum
# of Amount for that product/country pair.
rows_by_product = df_test.groupBy("Product")
pivotDF = rows_by_product.pivot("Country").sum("Amount")
pivotDF.printSchema()
pivotDF.show(truncate=False)

                                                                                

root
 |-- Product: string (nullable = true)
 |-- Canada: long (nullable = true)
 |-- China: long (nullable = true)
 |-- Mexico: long (nullable = true)
 |-- USA: long (nullable = true)

+-------+------+-----+------+----+
|Product|Canada|China|Mexico|USA |
+-------+------+-----+------+----+
|Orange |null  |4000 |null  |4000|
|Beans  |null  |1500 |2000  |1600|
|Banana |2000  |400  |null  |1000|
|Carrots|2000  |1200 |null  |1500|
+-------+------+-----+------+----+

