# VTL Simple analytic function



In [3]:
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, LongType, DecimalType
import os
from pyspark.sql.functions import lit, count,sum,avg,first

In [2]:
# Switch between an in-process Spark session and one submitted to Kubernetes.
local = True

if not local:
    # Cluster mode: the service account and namespace are read from the
    # environment, so a missing variable raises KeyError before the session
    # is created.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("VTLAnalytic")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .getOrCreate()
    )
else:
    # Local mode: master "local[4]" runs Spark inside this process with four threads.
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("VTLAnalytic")
        .getOrCreate()
    )

22/04/13 15:11:18 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.184.146 instead (on interface ens33)
22/04/13 15:11:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/04/13 15:11:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/13 15:11:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [8]:
# Sample rows in column order (Id_1, Id_2, Id_3, Me_1, Me_2).
data = [
    ("A", "XX", 2000, 3, 1),
    ("A", "XX", 2011, 4, 9),
    ("A", "XX", 2022, 7, 5),
    ("A", "XX", 2023, 6, 8),
    ("A", "YY", 2000, 9, 3),
    ("A", "YY", 2011, 5, 4),
    ("A", "YY", 2022, 10, 2),
    ("A", "YY", 2023, 5, 7),
]

# Column name -> Spark type; every column is declared nullable below.
column_types = [
    ("Id_1", StringType()),
    ("Id_2", StringType()),
    ("Id_3", IntegerType()),
    ("Me_1", IntegerType()),
    ("Me_2", IntegerType()),
]
schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in column_types]
)

df = spark.createDataFrame(data, schema)
df.show()

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|2000|   3|   1|
|   A|  XX|2011|   4|   9|
|   A|  XX|2022|   7|   5|
|   A|  XX|2023|   6|   8|
|   A|  YY|2000|   9|   3|
|   A|  YY|2011|   5|   4|
|   A|  YY|2022|  10|   2|
|   A|  YY|2023|   5|   7|
+----+----+----+----+----+



In [9]:
from pyspark.sql.window import Window

# VTL WindowClause

`windowClause ::= { data points | range } between limitClause and limitClause`

For comparison, an analytic invocation without a window clause: `count ( DS_1 over ( partition by Id_1 ) )`

It specifies how to apply a sliding window on the ordered Data Points. The keyword **data points** means that the sliding window includes a certain number of Data Points before and after the current Data Point, in the order given by the orderClause. The keyword **range** means that the sliding window includes all the Data Points whose values are in a certain range with respect to the value, for the current Data Point, of the Measure which the analytic is applied to.

## data points example


DS_r := first_value ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 data points between 1 preceding and 1 following) )

The current row has offset 0. The window frame is 1 preceding and 1 following, so the window contains at most three rows, at offsets [-1, 0, 1] relative to the current row. Note that the frame cannot extend beyond the boundaries of the partition. For example, for the last row of a partition the frame contains only two rows, [-1, 0], because the current row is the last row and the next row belongs to another partition. The same applies to the first row of a partition: the frame contains only two rows, [0, 1].

In [11]:
# Column the analytic function reads, and the name of the column it produces.
target_col_name = "Me_1"
new_col_name = f"first_value_{target_col_name}"

# Row-based frame: within each (Id_1, Id_2) partition ordered by Id_3, the
# frame spans from one row before the current row to one row after it.
partition_col_names = ["Id_1", "Id_2"]
win_name = (
    Window.partitionBy(partition_col_names)
    .orderBy("Id_3")
    .rowsBetween(-1, 1)
)

# first() over the frame yields the target value of the frame's first row.
first_in_frame = first(target_col_name).over(win_name)
df_collect = df.withColumn(new_col_name, first_in_frame)
df_collect.show()

+----+----+----+----+----+----------------+
|Id_1|Id_2|Id_3|Me_1|Me_2|first_value_Me_1|
+----+----+----+----+----+----------------+
|   A|  YY|2000|   9|   3|               9|
|   A|  YY|2011|   5|   4|               9|
|   A|  YY|2022|  10|   2|               5|
|   A|  YY|2023|   5|   7|              10|
|   A|  XX|2000|   3|   1|               3|
|   A|  XX|2011|   4|   9|               3|
|   A|  XX|2022|   7|   5|               4|
|   A|  XX|2023|   6|   8|               7|
+----+----+----+----+----+----------------+



## Range example
DS_r := first_value ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 range between 1 preceding and 1 following) )

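The range frame uses the value of the order by column of the current row as its baseline. For example, with the above query, the order by column is **Id_3**.
As a result, the baseline column is **Id_3**. With range between 1 preceding and 1 following, we now use the value of column **Id_3** in the current row to build the window. For the first row, the value of **Id_3** is 2000, so the window covers the rows whose **Id_3** lies between 2000-1=1999 and 2000+1=2001. Only the row itself falls in that range, so first_value returns its own Me_1. In contrast, the rows with **Id_3** 2022 and 2023 fall in each other's range, so for the 2023 row first_value returns the Me_1 of the 2022 row.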

In [10]:
# Column the analytic function reads, and the name of the column it produces.
target_col_name = "Me_1"
new_col_name = f"first_value_{target_col_name}"

# Value-based frame: within each (Id_1, Id_2) partition ordered by Id_3, the
# frame holds the rows whose Id_3 is within 1 of the current row's Id_3.
partition_col_names = ["Id_1", "Id_2"]
win_name = (
    Window.partitionBy(partition_col_names)
    .orderBy("Id_3")
    .rangeBetween(-1, 1)
)

# first() over the frame yields the target value of the frame's first row.
first_in_frame = first(target_col_name).over(win_name)
df_collect = df.withColumn(new_col_name, first_in_frame)
df_collect.show()

+----+----+----+----+----+----------------+
|Id_1|Id_2|Id_3|Me_1|Me_2|first_value_Me_1|
+----+----+----+----+----+----------------+
|   A|  YY|2000|   9|   3|               9|
|   A|  YY|2011|   5|   4|               5|
|   A|  YY|2022|  10|   2|              10|
|   A|  YY|2023|   5|   7|              10|
|   A|  XX|2000|   3|   1|               3|
|   A|  XX|2011|   4|   9|               4|
|   A|  XX|2022|   7|   5|               7|
|   A|  XX|2023|   6|   8|               7|
+----+----+----+----+----+----------------+

