# Test VTL analytics invocation




In [1]:
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, LongType, DecimalType
import os
from pyspark.sql.functions import lit, rand

In [2]:
# Switch between an in-process Spark session (True) and a Kubernetes-backed one (False).
local = True

if local:
    # Local mode: master "local[4]" runs Spark inside this process.
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("VTLAnalytic")
        .getOrCreate()
    )
else:
    # Cluster mode: executors are launched as pods through the in-cluster
    # Kubernetes API; service account and namespace are read from the environment.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("VTLAnalytic")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .getOrCreate()
    )

22/04/11 16:58:06 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.184.146 instead (on interface ens33)
22/04/11 16:58:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/04/11 16:58:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Sample rows in column order (Id_1, Id_2, Id_3, Me_1, Me_2).
data = [
    ("A", "XX", 2000, 3, 1),
    ("A", "XX", 2001, 4, 9),
    ("A", "XX", 2002, 7, 5),
    ("A", "XX", 2003, 6, 8),
    ("A", "YY", 2000, 9, 3),
    ("A", "YY", 2001, 5, 4),
    ("A", "YY", 2002, 10, 2),
    ("A", "YY", 2003, 5, 7),
]

# Every column is nullable; Id_1/Id_2 are strings, the remaining columns are integers.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("Id_1", StringType()),
        ("Id_2", StringType()),
        ("Id_3", IntegerType()),
        ("Me_1", IntegerType()),
        ("Me_2", IntegerType()),
    ]
])

# Build the input DataFrame with the explicit schema and display it.
df = spark.createDataFrame(data, schema)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|2000|   3|   1|
|   A|  XX|2001|   4|   9|
|   A|  XX|2002|   7|   5|
|   A|  XX|2003|   6|   8|
|   A|  YY|2000|   9|   3|
|   A|  YY|2001|   5|   4|
|   A|  YY|2002|  10|   2|
|   A|  YY|2003|   5|   7|
+----+----+----+----+----+



                                                                                

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|2000|   3|   1|
|   A|  XX|2001|   4|   9|
|   A|  XX|2002|   7|   5|
|   A|  XX|2003|   6|   8|
|   A|  YY|2000|   9|   3|
|   A|  YY|2001|   5|   4|
|   A|  YY|2002|  10|   2|
|   A|  YY|2003|   5|   7|
+----+----+----+----+----+



In [5]:
# Print df's column names, types and nullability to confirm the explicit schema was applied.
df.printSchema()

root
 |-- Id_1: string (nullable = true)
 |-- Id_2: string (nullable = true)
 |-- Id_3: integer (nullable = true)
 |-- Me_1: integer (nullable = true)
 |-- Me_2: integer (nullable = true)



# VTL window function

Original DataFrame (`df`):

DS_1:
|Id_1| Id_2| Id_3| Me_1| Me_2|
|----|-----|------|----|-----|
|A |XX |2000| 3 |1|
|A |XX |2001 |4 |9|
|A |XX |2002 |7 |5|
|A |XX |2003 |6 |8|
|A |YY |2000 |9 |3|
|A |YY |2001 |5 |4|
|A |YY |2002 |10 |2|
|A |YY |2003 |5 |7|


DS_r := DS_1 [ calc Me_2 := rank ( over ( partition by Id_1 , Id_2 order by Me_1 ) ) ]

DS_r:
|Id_1| Id_2| Id_3| Me_1| Me_2|
|----|-----|------|----|-----|
|A |XX |2000 |3 |1|
|A |XX |2001 |4 |2|
|A |XX |2003 |6 |3|
|A |XX |2002 |7 |4|
|A |YY |2001 |5 |1|
|A |YY| 2003 |5 |1|
|A |YY |2000 |9 |3|
|A |YY |2002 |10 |4|


In [12]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank,col,desc

# Columns that define each window partition, and the ordering inside a partition.
partition_col_name = ["Id_1", "Id_2"]
order_col_name = [col("Me_1").asc()]

# Window spec: one partition per (Id_1, Id_2) pair, rows ordered by ascending Me_1.
win_name = Window.partitionBy(partition_col_name)
win_name_order = win_name.orderBy(order_col_name)

# Drop the original Me_2, then add the rank under the same column name.
# Rows with equal Me_1 share a rank and the following rank is skipped.
df1 = (
    df.drop("Me_2")
    .withColumn("Me_2", rank().over(win_name_order))
)
df1.show()

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  YY|2001|   5|   1|
|   A|  YY|2003|   5|   1|
|   A|  YY|2000|   9|   3|
|   A|  YY|2002|  10|   4|
|   A|  XX|2000|   3|   1|
|   A|  XX|2001|   4|   2|
|   A|  XX|2003|   6|   3|
|   A|  XX|2002|   7|   4|
+----+----+----+----+----+

