# Test vtl analytics invocation




In [1]:
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, LongType, DecimalType
import os
from pyspark.sql.functions import lit, rand

In [2]:
local = True

if local:
    spark = SparkSession.builder \
        .master("local[4]") \
        .appName("VTLAnalytic")\
        .getOrCreate()
else:
    spark = SparkSession.builder\
        .master("k8s://https://kubernetes.default.svc:443") \
        .appName("VTLAnalytic")\
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")\
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])\
        .config("spark.executor.instances", "4")\
        .config("spark.executor.memory", "8g")\
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])\
        .getOrCreate()

22/04/12 13:50:58 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.184.146 instead (on interface ens33)
22/04/12 13:50:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/04/12 13:50:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [25]:
data=[("A", "XX", 2000, 3, 1),
    ("A", "XX", 2001, 4, 9),
    ("A", "XX", 2002, 7, 5),
    ("A", "XX", 2003, 6, 8),
    ("A", "YY", 2000, 9, 3),
    ("A", "YY", 2001, 5, 4),
    ("A", "YY", 2002, 10, 2),
    ("A", "YY", 2003, 5, 7)]

schema=StructType([StructField("Id_1",StringType(),True),
                   StructField("Id_2",StringType(),True),
                   StructField("Id_3",IntegerType(),True),
                   StructField("Me_1",IntegerType(),True),
                   StructField("Me_2",IntegerType(),True)])

df=spark.createDataFrame(data, schema)
df.show()

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|2000|   3|   1|
|   A|  XX|2001|   4|   9|
|   A|  XX|2002|   7|   5|
|   A|  XX|2003|   6|   8|
|   A|  YY|2000|   9|   3|
|   A|  YY|2001|   5|   4|
|   A|  YY|2002|  10|   2|
|   A|  YY|2003|   5|   7|
+----+----+----+----+----+



In [4]:
df.printSchema()

root
 |-- Id_1: string (nullable = true)
 |-- Id_2: string (nullable = true)
 |-- Id_3: integer (nullable = true)
 |-- Me_1: integer (nullable = true)
 |-- Me_2: integer (nullable = true)



# VTL window function

## 1.1 Rank Example

origin df

DS_1:
|Id_1| Id_2| Id_3| Me_1| Me_2|
|----|-----|------|----|-----|
|A |XX |2000| 3 |1|
|A |XX |2001 |4 |9|
|A |XX |2002 |7 |5|
|A |XX |2003 |6 |8|
|A |YY |2000 |9 |3|
|A |YY |2001 |5 |4|
|A |YY |2002 |10 |2|
|A |YY |2003 |5 |7|


DS_r := DS_1 [ calc Me2 := rank ( over ( partition by Id_1 , Id_2 order by Me_1 ) ) ]

DS_r
|Id_1| Id_2| Id_3| Me_1| Me_2|
|----|-----|------|----|-----|
|A |XX |2000 |3 |1|
|A |XX |2001 |4 |2|
|A |XX |2003 |6 |3|
|A |XX |2002 |7 |4|
|A |YY |2001 |5 |1|
|A |YY| 2003 |5 |1|
|A |YY |2000 |9 |3|
|A |YY |2002 |10 |4|


In [5]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank,col,desc
partition_col_name=["Id_1","Id_2"]
win_name=Window.partitionBy(partition_col_name)

order_col_name=[col("Me_1").asc()]
win_name_order=win_name.orderBy(order_col_name)


df1 = df.drop("Me_2").withColumn("Me_2", rank().over(win_name_order))
df1.show()

                                                                                

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  YY|2001|   5|   1|
|   A|  YY|2003|   5|   1|
|   A|  YY|2000|   9|   3|
|   A|  YY|2002|  10|   4|
|   A|  XX|2000|   3|   1|
|   A|  XX|2001|   4|   2|
|   A|  XX|2003|   6|   3|
|   A|  XX|2002|   7|   4|
+----+----+----+----+----+



## 1.2 First() Example

DS_r := first_value ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 data points between 1 preceding and 1 following) )


In [7]:
from pyspark.sql.functions import first, last

partition_col_name=["Id_1","Id_2"]
win_name=Window.partitionBy(partition_col_name)

order_col_name=[col("Id_3").asc()]
win_name_order=win_name.orderBy(order_col_name)

df_first=df.withColumn("first",first("Me_1").over(win_name_order))
df_first.show()

+----+----+----+----+----+-----+
|Id_1|Id_2|Id_3|Me_1|Me_2|first|
+----+----+----+----+----+-----+
|   A|  YY|2000|   9|   3|    9|
|   A|  YY|2001|   5|   4|    9|
|   A|  YY|2002|  10|   2|    9|
|   A|  YY|2003|   5|   7|    9|
|   A|  XX|2000|   3|   1|    3|
|   A|  XX|2001|   4|   9|    3|
|   A|  XX|2002|   7|   5|    3|
|   A|  XX|2003|   6|   8|    3|
+----+----+----+----+----+-----+



## 1.3 Last

The spark last does not correspond the specification of VTL. So here we just reverse the order and use first() to simulate last() function. For more details, please visit



In [31]:
# here we reverse the order of windows to use first to get last
order_col_name=[col("Id_3").desc()]
win_name_order=win_name.orderBy(order_col_name)

df_last=df.withColumn("last",first("Me_1").over(win_name_order))
df_last.show()

+----+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|last|
+----+----+----+----+----+----+
|   A|  YY|2003|   5|   7|   5|
|   A|  YY|2002|  10|   2|   5|
|   A|  YY|2001|   5|   4|   5|
|   A|  YY|2000|   9|   3|   5|
|   A|  XX|2003|   6|   8|   6|
|   A|  XX|2002|   7|   5|   6|
|   A|  XX|2001|   4|   9|   6|
|   A|  XX|2000|   3|   1|   6|
+----+----+----+----+----+----+



## 1.4 Lead()

DS_r := lead ( DS_1 , 1 over ( partition by Id_1 , Id_2 order by Id_3 ) )

Input: DS_1
```text
Id_1 Id_2 Id_3 Me_1 Me_2
A XX 1993 3 1
A XX 1994 4 9
A XX 1995 7 5
A XX 1996 6 8
A YY 1993 9 3
A YY 1994 5 4
A YY 1995 10 2
A YY 1996 2 7
```

Output : DS_r
```text
Id_1 Id_2 Id_3 Me_1 Me_2
A XX 1993 4 9
A XX 1994 7 5
A XX 1995 6 8
A XX 1996 NULL NULL
A YY 1993 5 4
A YY 1994 10 2
A YY 1995 2 7
A YY 1996 NULL NULL
```


In [27]:
from pyspark.sql.functions import lead, lag
partition_col_name=["Id_1","Id_2"]
win_name=Window.partitionBy(partition_col_name)

order_col_name=[col("Id_3").asc()]
win_name_order=win_name.orderBy(order_col_name)
df_input=df
col_names=["Me_1","Me_2"]
step=1
for col_name in col_names:
    lead_col_name=f"lead_{col_name}"
    df_input=df_input.select("*",lead(col_name,1).over(win_name_order).alias(lead_col_name)).drop(col_name).withColumnRenamed(lead_col_name,col_name)
df_input.show()

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  YY|2000|   5|   4|
|   A|  YY|2001|  10|   2|
|   A|  YY|2002|   5|   7|
|   A|  YY|2003|null|null|
|   A|  XX|2000|   4|   9|
|   A|  XX|2001|   7|   5|
|   A|  XX|2002|   6|   8|
|   A|  XX|2003|null|null|
+----+----+----+----+----+



## 1.5 Lag()

In the ordered set of Data Points of the current partition, the operator returns the value(s) taken from the Data Point at the specified physical offset prior to the current Data Point.

If defaultValue is not specified then the value returned when the offset goes outside the partition is NULL.

VTL query: DS_r := lag ( DS_1 , 1 over ( partition by Id_1 , Id_2 order by Id_3 ) ) results in:
Input: DS_1
```text
Id_1 Id_2 Id_3 Me_1 Me_2
A XX 1993 3 1
A XX 1994 4 9
A XX 1995 7 5
A XX 1996 6 8
A YY 1993 9 3
A YY 1994 5 4
A YY 1995 10 2
A YY 1996 2 7

```

Output: DS_r
```text
Id_1 Id_2 Id_3 Me_1 Me_2
A XX 1993 NULL NULL
A XX 1994 3 1
A XX 1995 4 9
A XX 1996 7 5
A YY 1993 NULL NULL
A YY 1994 9 3
A YY 1995 5 4
A YY 1996 10 2
```


In [28]:
from pyspark.sql.functions import lead, lag
partition_col_name=["Id_1","Id_2"]
win_name=Window.partitionBy(partition_col_name)

order_col_name=[col("Id_3").asc()]
win_name_order=win_name.orderBy(order_col_name)
df_input=df
col_names=["Me_1","Me_2"]
step=1
for col_name in col_names:
    lead_col_name=f"lead_{col_name}"
    df_input=df_input.select("*",lag(col_name,1).over(win_name_order).alias(lead_col_name)).drop(col_name).withColumnRenamed(lead_col_name,col_name)
df_input.show()

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  YY|2000|null|null|
|   A|  YY|2001|   9|   3|
|   A|  YY|2002|   5|   4|
|   A|  YY|2003|  10|   2|
|   A|  XX|2000|null|null|
|   A|  XX|2001|   3|   1|
|   A|  XX|2002|   4|   9|
|   A|  XX|2003|   7|   5|
+----+----+----+----+----+



## 1.6 ratio_to_report()

The operator returns the ratio between the value of the current Data Point and the sum of the values of the partition which the current Data Point belongs to.

ETL example

DS_r := ratio_to_report ( DS_1 over ( partition by Id_1, Id_2 ) )

Note, here even though the above vtl request does not specify which column we need to apply the ratio_to_report on. The result calculate the ratio_to_report on column, Me_1 and Me_2. Because VTL by default will apply all function without parameter on all column which has property **Measurement**

Input DS_1

```text

Id_1 Id_2 Id_3 Me_1 Me_2
A XX 2000 3 1
A XX 2001 4 3
A XX 2002 7 5
A XX 2003 6 1
A YY 2000 12 0
A YY 2001 8 8
A YY 2002 6 5
A YY 2003 14 -3

```

output: DS_r

```text
Id_1 Id_2 Id_3 Me_1 Me_2
A YY 2000 0.3 0
A YY 2001 0.2 0.8
A YY 2002 0.15 0.5
A YY 2003 0.35 -0.3
A XX 2000 0.15 0,1
A XX 2001 0.2 0.3
A XX 2002 0.35 0.5
A XX 2003 0.3 0.1

```

In [9]:
data1=[
    ("A", "XX", 2001, 4, 3),
    ("A", "XX", 2002, 7, 5),
    ("A", "XX", 2000, 3, 1),
    ("A", "XX", 2003, 6, 1),
    ("A", "YY", 2000, 12, 0),
    ("A", "YY", 2001, 8, 8),
    ("A", "YY", 2002, 6, 5),
    ("A", "YY", 2003, 14, -3)]

schema1=StructType([StructField("Id_1",StringType(),True),
                   StructField("Id_2",StringType(),True),
                   StructField("Id_3",IntegerType(),True),
                   StructField("Me_1",IntegerType(),True),
                   StructField("Me_2",IntegerType(),True)])

df1=spark.createDataFrame(data1, schema1)
df1.show()

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|2000|   3|   1|
|   A|  XX|2001|   4|   3|
|   A|  XX|2002|   7|   5|
|   A|  XX|2003|   6|   1|
|   A|  YY|2000|  12|   0|
|   A|  YY|2001|   8|   8|
|   A|  YY|2002|   6|   5|
|   A|  YY|2003|  14|  -3|
+----+----+----+----+----+



In [19]:
from pyspark.sql.functions import sum
partition_col_name=["Id_1","Id_2"]
win_name=Window.partitionBy(partition_col_name)

order_col_name=[col("Id_3").asc()]
win_name_order=win_name.orderBy(rand(100))

col_names=["Me_1","Me_2"]
for col_name in col_names:
    total_col_name=f"total_{col_name}"
    df1=df1.withColumn(total_col_name,sum(col_name).over(win_name)).withColumn(f"ratio_{col_name}",col(col_name)/col(f"total_{col_name}")).drop(total_col_name).drop(col_name).withColumnRenamed(total_col_name,col_name)
df1.show()

+----+----+----+----------+----------+
|Id_1|Id_2|Id_3|ratio_Me_1|ratio_Me_2|
+----+----+----+----------+----------+
|   A|  YY|2000|       0.3|       0.0|
|   A|  YY|2001|       0.2|       0.8|
|   A|  YY|2002|      0.15|       0.5|
|   A|  YY|2003|      0.35|      -0.3|
|   A|  XX|2000|      0.15|       0.1|
|   A|  XX|2001|       0.2|       0.3|
|   A|  XX|2002|      0.35|       0.5|
|   A|  XX|2003|       0.3|       0.1|
+----+----+----+----------+----------+

