# Test vtl analytics invocation




In [34]:
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, LongType, DecimalType
import os
from pyspark.sql.functions import lit, rand,col, rank, desc, first, last,sum
from pyspark.sql.window import Window


In [2]:
local = True

if local:
    spark = SparkSession.builder \
        .master("local[4]") \
        .appName("VTLAnalytic")\
        .getOrCreate()
else:
    spark = SparkSession.builder\
        .master("k8s://https://kubernetes.default.svc:443") \
        .appName("VTLAnalytic")\
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")\
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])\
        .config("spark.executor.instances", "4")\
        .config("spark.executor.memory", "8g")\
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])\
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-07-04 16:36:49,331 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
data=[("A", "XX", 1993, 3, 1.0),
    ("A", "XX", 1994, 4, 9.0),
    ("A", "XX", 1995, 7, 5.0),
    ("A", "XX", 1996, 6, 8.0),
    ("A", "YY", 1993, 9, 3.0),
    ("A", "YY", 1994, 5, 4.0),
    ("A", "YY", 1995, 10, 2.0),
    ("A", "YY", 1996, 2, 7.0)]

schema=StructType([StructField("Id_1",StringType(),True),
                   StructField("Id_2",StringType(),True),
                   StructField("Year",IntegerType(),True),
                   StructField("Me_1",IntegerType(),True),
                   StructField("Me_2",DoubleType(),True)])

df=spark.createDataFrame(data, schema)
df.show()

                                                                                

+----+----+----+----+----+
|Id_1|Id_2|Year|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|1993|   3| 1.0|
|   A|  XX|1994|   4| 9.0|
|   A|  XX|1995|   7| 5.0|
|   A|  XX|1996|   6| 8.0|
|   A|  YY|1993|   9| 3.0|
|   A|  YY|1994|   5| 4.0|
|   A|  YY|1995|  10| 2.0|
|   A|  YY|1996|   2| 7.0|
+----+----+----+----+----+



In [4]:
df.printSchema()

root
 |-- Id_1: string (nullable = true)
 |-- Id_2: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Me_1: integer (nullable = true)
 |-- Me_2: double (nullable = true)



# 2. VTL advance Analytic function

## 2.1 Rank 

**Analytic clause restriction**

- Must have `orderClause`
- The `windowClause` such as `data points` and `range` are not allowed

### 2.1.1 Exp1 : window has partition clause and order by clause


```text
res := ds1 [calc rank_col:= rank ( over ( partition by Id_1, Id_2 order by Year) )]
```

**Note: rank function requires rows must be ordered, so we can't apply rank on clause that only have partition**

In [5]:



partition_col_names=["Id_1","Id_2"]
order_col_names=[col("Year").asc()]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names)

new_col_name=f"rank_col"


df_rank=df.withColumn(new_col_name,rank().over(win_name)) 
df_rank.show()



+----+----+----+----+----+--------+
|Id_1|Id_2|Year|Me_1|Me_2|rank_col|
+----+----+----+----+----+--------+
|   A|  XX|1993|   3| 1.0|       1|
|   A|  XX|1994|   4| 9.0|       2|
|   A|  XX|1995|   7| 5.0|       3|
|   A|  XX|1996|   6| 8.0|       4|
|   A|  YY|1993|   9| 3.0|       1|
|   A|  YY|1994|   5| 4.0|       2|
|   A|  YY|1995|  10| 2.0|       3|
|   A|  YY|1996|   2| 7.0|       4|
+----+----+----+----+----+--------+



### 2.1.2 Exp2 : window has partition clause and order by clause with desc


```text
res := ds1 [calc rank_col:= rank ( over ( partition by Id_1, Id_2 order by Year desc) )]
```

In [6]:
partition_col_names=["Id_1","Id_2"]
order_col_names=[col("Year").desc()]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names)

new_col_name=f"rank_col"


df_rank=df.withColumn(new_col_name,rank().over(win_name)) 
df_rank.show()

+----+----+----+----+----+--------+
|Id_1|Id_2|Year|Me_1|Me_2|rank_col|
+----+----+----+----+----+--------+
|   A|  XX|1996|   6| 8.0|       1|
|   A|  XX|1995|   7| 5.0|       2|
|   A|  XX|1994|   4| 9.0|       3|
|   A|  XX|1993|   3| 1.0|       4|
|   A|  YY|1996|   2| 7.0|       1|
|   A|  YY|1995|  10| 2.0|       2|
|   A|  YY|1994|   5| 4.0|       3|
|   A|  YY|1993|   9| 3.0|       4|
+----+----+----+----+----+--------+



### 2.1.3 Exp3 : window has partition clause, order by clause and data points (not allowed in VTL, possible in spark)


```text
res := ds1 [calc rank_col:= rank ( over ( partition by Id_1, Id_2 order by Year desc data points between unbounded preceding and current data point) )]
```

**Note rank function can't not take rolling window such as [-1,1]. It requires [unboundedPreceding,currentRow]**. This window specification has no effect, because it's identical to default rolling window definition. So no need to use `data points` and `range between` in rank

In [7]:
partition_col_names=["Id_1","Id_2"]
order_col_names=[col("Year")]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names).rowsBetween(Window.unboundedPreceding,Window.currentRow)

new_col_name=f"rank_col"


df_rank=df.withColumn(new_col_name,rank().over(win_name)) 
df_rank.show()


+----+----+----+----+----+--------+
|Id_1|Id_2|Year|Me_1|Me_2|rank_col|
+----+----+----+----+----+--------+
|   A|  XX|1993|   3| 1.0|       1|
|   A|  XX|1994|   4| 9.0|       2|
|   A|  XX|1995|   7| 5.0|       3|
|   A|  XX|1996|   6| 8.0|       4|
|   A|  YY|1993|   9| 3.0|       1|
|   A|  YY|1994|   5| 4.0|       2|
|   A|  YY|1995|  10| 2.0|       3|
|   A|  YY|1996|   2| 7.0|       4|
+----+----+----+----+----+--------+



## 2.2 First

### 2.2.1 Exp1 : window has partition clause


```text
res :=  first_value ( ds1 over ( partition by Id_1, Id_2) )
```


In [8]:
from pyspark.sql.functions import first, last

partition_col_names=["Id_1","Id_2"]


win_name=Window.partitionBy(partition_col_names)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"first_{target_col_name1}"
new_col_name2=f"first_{target_col_name2}"

df_first=df.withColumn(new_col_name1,first(target_col_name1).over(win_name))\
           .withColumn(new_col_name2,first(target_col_name2).over(win_name))


df_first.show()

+----+----+----+----+----+----------+----------+
|Id_1|Id_2|Year|Me_1|Me_2|first_Me_1|first_Me_2|
+----+----+----+----+----+----------+----------+
|   A|  XX|1993|   3| 1.0|         3|       1.0|
|   A|  XX|1994|   4| 9.0|         3|       1.0|
|   A|  XX|1995|   7| 5.0|         3|       1.0|
|   A|  XX|1996|   6| 8.0|         3|       1.0|
|   A|  YY|1993|   9| 3.0|         9|       3.0|
|   A|  YY|1994|   5| 4.0|         9|       3.0|
|   A|  YY|1995|  10| 2.0|         9|       3.0|
|   A|  YY|1996|   2| 7.0|         9|       3.0|
+----+----+----+----+----+----------+----------+



### 2.2.2 Exp2 : window has partition clause and order by clause

vtl query

```text
res :=  first_value ( ds1 over ( partition by Id_1, Id_2 order by Year desc) )
```

In [9]:


partition_col_names=["Id_1","Id_2"]

order_col_names=[col("Year").desc()]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"first_{target_col_name1}"
new_col_name2=f"first_{target_col_name2}"

df_first=df.withColumn(new_col_name1,first(target_col_name1).over(win_name))\
           .withColumn(new_col_name2,first(target_col_name2).over(win_name))


df_first.show()

+----+----+----+----+----+----------+----------+
|Id_1|Id_2|Year|Me_1|Me_2|first_Me_1|first_Me_2|
+----+----+----+----+----+----------+----------+
|   A|  XX|1996|   6| 8.0|         6|       8.0|
|   A|  XX|1995|   7| 5.0|         6|       8.0|
|   A|  XX|1994|   4| 9.0|         6|       8.0|
|   A|  XX|1993|   3| 1.0|         6|       8.0|
|   A|  YY|1996|   2| 7.0|         2|       7.0|
|   A|  YY|1995|  10| 2.0|         2|       7.0|
|   A|  YY|1994|   5| 4.0|         2|       7.0|
|   A|  YY|1993|   9| 3.0|         2|       7.0|
+----+----+----+----+----+----------+----------+



### 2.2.3 Exp : window has partition, order by and data point

The vtl query:

```text
res := first_value ( ds1 over ( partition by Id_1 order by Id_2 data points between 2 preceding and 2 following) )
```

In [10]:
partition_col_names=["Id_1"]

order_col_names=[col("Id_2")]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names).rowsBetween(-2,2)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"first_{target_col_name1}"
new_col_name2=f"first_{target_col_name2}"

df_first=df.withColumn(new_col_name1,first(target_col_name1).over(win_name))\
           .withColumn(new_col_name2,first(target_col_name2).over(win_name))


df_first.show()

+----+----+----+----+----+----------+----------+
|Id_1|Id_2|Year|Me_1|Me_2|first_Me_1|first_Me_2|
+----+----+----+----+----+----------+----------+
|   A|  XX|1993|   3| 1.0|         3|       1.0|
|   A|  XX|1994|   4| 9.0|         3|       1.0|
|   A|  XX|1995|   7| 5.0|         3|       1.0|
|   A|  XX|1996|   6| 8.0|         4|       9.0|
|   A|  YY|1993|   9| 3.0|         7|       5.0|
|   A|  YY|1994|   5| 4.0|         6|       8.0|
|   A|  YY|1995|  10| 2.0|         9|       3.0|
|   A|  YY|1996|   2| 7.0|         5|       4.0|
+----+----+----+----+----+----------+----------+



### 2.2.4 Exp : window has partition, order by and range 

The vtl query:

```text
res := first_value ( ds1 over ( partition by Id_1 order by Year range between -1 and 1) )
```

In [11]:
partition_col_names=["Id_1"]

order_col_names=[col("Year")]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names).rangeBetween(-1,1)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"first_{target_col_name1}"
new_col_name2=f"first_{target_col_name2}"

df_first=df.withColumn(new_col_name1,first(target_col_name1).over(win_name))\
           .withColumn(new_col_name2,first(target_col_name2).over(win_name))


df_first.show()

+----+----+----+----+----+----------+----------+
|Id_1|Id_2|Year|Me_1|Me_2|first_Me_1|first_Me_2|
+----+----+----+----+----+----------+----------+
|   A|  XX|1993|   3| 1.0|         3|       1.0|
|   A|  YY|1993|   9| 3.0|         3|       1.0|
|   A|  XX|1994|   4| 9.0|         3|       1.0|
|   A|  YY|1994|   5| 4.0|         3|       1.0|
|   A|  XX|1995|   7| 5.0|         4|       9.0|
|   A|  YY|1995|  10| 2.0|         4|       9.0|
|   A|  XX|1996|   6| 8.0|         7|       5.0|
|   A|  YY|1996|   2| 7.0|         7|       5.0|
+----+----+----+----+----+----------+----------+



## 2.3 Last

The spark last does not correspond the specification of VTL. So here we just reverse the order and use first() to simulate last() function. For more details, please visit

### 2.3.1 Exp1 : window has partition clause


```text
res :=  last_value ( ds1 over ( partition by Id_1, Id_2) )
```


In [13]:
partition_col_names=["Id_1","Id_2"]


win_name=Window.partitionBy(partition_col_names)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"last_{target_col_name1}"
new_col_name2=f"last_{target_col_name2}"

df_last=df.withColumn(new_col_name1,last(target_col_name1).over(win_name))\
           .withColumn(new_col_name2,last(target_col_name2).over(win_name))


df_last.show()

+----+----+----+----+----+---------+---------+
|Id_1|Id_2|Year|Me_1|Me_2|last_Me_1|last_Me_2|
+----+----+----+----+----+---------+---------+
|   A|  XX|1993|   3| 1.0|        6|      8.0|
|   A|  XX|1994|   4| 9.0|        6|      8.0|
|   A|  XX|1995|   7| 5.0|        6|      8.0|
|   A|  XX|1996|   6| 8.0|        6|      8.0|
|   A|  YY|1993|   9| 3.0|        2|      7.0|
|   A|  YY|1994|   5| 4.0|        2|      7.0|
|   A|  YY|1995|  10| 2.0|        2|      7.0|
|   A|  YY|1996|   2| 7.0|        2|      7.0|
+----+----+----+----+----+---------+---------+



### 2.3.2 Exp2 : window has partition clause and order by clause

vtl query

```text
res :=  last_value ( ds1 over ( partition by Id_1, Id_2 order by Year desc) )
```

In [14]:
partition_col_names=["Id_1","Id_2"]
order_col_names=[col("Year").desc()]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"last_{target_col_name1}"
new_col_name2=f"last_{target_col_name2}"

df_last=df.withColumn(new_col_name1,last(target_col_name1).over(win_name))\
           .withColumn(new_col_name2,last(target_col_name2).over(win_name))


df_last.show()

+----+----+----+----+----+---------+---------+
|Id_1|Id_2|Year|Me_1|Me_2|last_Me_1|last_Me_2|
+----+----+----+----+----+---------+---------+
|   A|  XX|1996|   6| 8.0|        6|      8.0|
|   A|  XX|1995|   7| 5.0|        7|      5.0|
|   A|  XX|1994|   4| 9.0|        4|      9.0|
|   A|  XX|1993|   3| 1.0|        3|      1.0|
|   A|  YY|1996|   2| 7.0|        2|      7.0|
|   A|  YY|1995|  10| 2.0|       10|      2.0|
|   A|  YY|1994|   5| 4.0|        5|      4.0|
|   A|  YY|1993|   9| 3.0|        9|      3.0|
+----+----+----+----+----+---------+---------+



### 2.3.3 Exp : window has partition, order by and data point

The vtl query:

```text
res := last_value ( ds1 over ( partition by Id_1 order by Id_2 data points between 2 preceding and 2 following) )
```

In [15]:
partition_col_names=["Id_1"]
order_col_names=[col("Id_2")]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names).rowsBetween(-2,2)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"last_{target_col_name1}"
new_col_name2=f"last_{target_col_name2}"

df_last=df.withColumn(new_col_name1,last(target_col_name1).over(win_name))\
           .withColumn(new_col_name2,last(target_col_name2).over(win_name))


df_last.show()

+----+----+----+----+----+---------+---------+
|Id_1|Id_2|Year|Me_1|Me_2|last_Me_1|last_Me_2|
+----+----+----+----+----+---------+---------+
|   A|  XX|1993|   3| 1.0|        7|      5.0|
|   A|  XX|1994|   4| 9.0|        6|      8.0|
|   A|  XX|1995|   7| 5.0|        9|      3.0|
|   A|  XX|1996|   6| 8.0|        5|      4.0|
|   A|  YY|1993|   9| 3.0|       10|      2.0|
|   A|  YY|1994|   5| 4.0|        2|      7.0|
|   A|  YY|1995|  10| 2.0|        2|      7.0|
|   A|  YY|1996|   2| 7.0|        2|      7.0|
+----+----+----+----+----+---------+---------+



### 2.3.4 Exp : window has partition, order by and range 

The vtl query:

```text
res := last_value ( ds1 over ( partition by Id_1, Id_2 order by Year range between -1 and 1) )
```

In [16]:
partition_col_names=["Id_1","Id_2"]
order_col_names=[col("Year")]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names).rangeBetween(-1,1)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"last_{target_col_name1}"
new_col_name2=f"last_{target_col_name2}"

df_last=df.withColumn(new_col_name1,last(target_col_name1).over(win_name))\
           .withColumn(new_col_name2,last(target_col_name2).over(win_name))


df_last.show()

+----+----+----+----+----+---------+---------+
|Id_1|Id_2|Year|Me_1|Me_2|last_Me_1|last_Me_2|
+----+----+----+----+----+---------+---------+
|   A|  XX|1993|   3| 1.0|        4|      9.0|
|   A|  XX|1994|   4| 9.0|        7|      5.0|
|   A|  XX|1995|   7| 5.0|        6|      8.0|
|   A|  XX|1996|   6| 8.0|        6|      8.0|
|   A|  YY|1993|   9| 3.0|        5|      4.0|
|   A|  YY|1994|   5| 4.0|       10|      2.0|
|   A|  YY|1995|  10| 2.0|        2|      7.0|
|   A|  YY|1996|   2| 7.0|        2|      7.0|
+----+----+----+----+----+---------+---------+



#### 2.3.5 Exp to compare with official example


```text
res :=  = last_value ( ds1 over ( partition by Id_1, Id_2 order by Id_3 data points between 1 preceding and 1 following ) 
```

page 178, the official doc is wrong

results in: DS_r

```text
Id_1 Id_2 Id_3 Me_1 Me_2
A XX 1993 4 9
A XX 1994 7 9
A XX 1995 7 9
A XX 1996 7 8
A YY 1993 9 4
A YY 1994 10 4
A YY 1995 10 7
A YY 1996 10 7
```

In [17]:
data1=[("A", "XX", 1993, 3, 1.0),
    ("A", "XX", 1994, 4, 9.0),
    ("A", "XX", 1995, 7, 5.0),
    ("A", "XX", 1996, 6, 8.0),
    ("A", "YY", 1993, 9, 3.0),
    ("A", "YY", 1994, 5, 4.0),
    ("A", "YY", 1995, 10, 2.0),
    ("A", "YY", 1996, 2, 7.0)]

schema1=StructType([StructField("Id_1",StringType(),True),
                   StructField("Id_2",StringType(),True),
                   StructField("Year",IntegerType(),True),
                   StructField("Me_1",IntegerType(),True),
                   StructField("Me_2",DoubleType(),True)])

df1=spark.createDataFrame(data1, schema1)
df1.show()

+----+----+----+----+----+
|Id_1|Id_2|Year|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|1993|   3| 1.0|
|   A|  XX|1994|   4| 9.0|
|   A|  XX|1995|   7| 5.0|
|   A|  XX|1996|   6| 8.0|
|   A|  YY|1993|   9| 3.0|
|   A|  YY|1994|   5| 4.0|
|   A|  YY|1995|  10| 2.0|
|   A|  YY|1996|   2| 7.0|
+----+----+----+----+----+



In [39]:
partition_col_names=["Id_1","Id_2"]

order_col_names=[col("Year")]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names).rowsBetween(-1,1)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"last_{target_col_name1}"
new_col_name2=f"last_{target_col_name2}"

df_last1=df1.withColumn(new_col_name1,last(target_col_name1).over(win_name))\
           .withColumn(new_col_name2,last(target_col_name2).over(win_name))


df_last1.show()

+----+----+----+----+----+---------+---------+
|Id_1|Id_2|Year|Me_1|Me_2|last_Me_1|last_Me_2|
+----+----+----+----+----+---------+---------+
|   A|  XX|1993|   3| 1.0|        4|      9.0|
|   A|  XX|1994|   4| 9.0|        7|      5.0|
|   A|  XX|1995|   7| 5.0|        6|      8.0|
|   A|  XX|1996|   6| 8.0|        6|      8.0|
|   A|  YY|1993|   9| 3.0|        5|      4.0|
|   A|  YY|1994|   5| 4.0|       10|      2.0|
|   A|  YY|1995|  10| 2.0|        2|      7.0|
|   A|  YY|1996|   2| 7.0|        2|      7.0|
+----+----+----+----+----+---------+---------+



## 2.4 Lead()

Like rank function, the window must be order, if you want to apply lead function

The lead function take two argument:
- input dataframe
- step

**Analytic clause restriction**

- Must have `orderClause`
- The `windowClause` such as `data points` and `range` are not allowed

### 2.4.1 Exp1 : window has partition clause and order by clause

below vtl query takes ds1 as input df, and 1 as step

```text
res :=  lead ( ds1,1 over ( partition by Id_1, Id_2 order by Year desc) )
```



In [25]:
partition_col_names=["Id_1","Id_2"]

order_col_names=[col("Year").desc()]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"lead_{target_col_name1}"
new_col_name2=f"lead_{target_col_name2}"

step=1

df_lead=df.withColumn(new_col_name1,lead(target_col_name1,step).over(win_name))\
           .withColumn(new_col_name2,lead(target_col_name2,step).over(win_name))


df_lead.show()

+----+----+----+----+----+---------+---------+
|Id_1|Id_2|Year|Me_1|Me_2|lead_Me_1|lead_Me_2|
+----+----+----+----+----+---------+---------+
|   A|  XX|1996|   6| 8.0|        7|      5.0|
|   A|  XX|1995|   7| 5.0|        4|      9.0|
|   A|  XX|1994|   4| 9.0|        3|      1.0|
|   A|  XX|1993|   3| 1.0|     null|     null|
|   A|  YY|1996|   2| 7.0|       10|      2.0|
|   A|  YY|1995|  10| 2.0|        5|      4.0|
|   A|  YY|1994|   5| 4.0|        9|      3.0|
|   A|  YY|1993|   9| 3.0|     null|     null|
+----+----+----+----+----+---------+---------+



### 2.4.2 Test to confirm example in official doc
DS_r := lead ( ds1 , 1 over ( partition by Id_1 , Id_2 order by Year ) )

Input: DS_1
```text
Id_1 Id_2 Id_3 Me_1 Me_2
A XX 1993 3 1
A XX 1994 4 9
A XX 1995 7 5
A XX 1996 6 8
A YY 1993 9 3
A YY 1994 5 4
A YY 1995 10 2
A YY 1996 2 7
```

Output : DS_r
```text
Id_1 Id_2 Id_3 Me_1 Me_2
A XX 1993 4 9
A XX 1994 7 5
A XX 1995 6 8
A XX 1996 NULL NULL
A YY 1993 5 4
A YY 1994 10 2
A YY 1995 2 7
A YY 1996 NULL NULL
```

In [26]:
from pyspark.sql.functions import lead, lag
partition_col_names=["Id_1","Id_2"]
win_name=Window.partitionBy(partition_col_names)

order_col_names=[col("Year").asc()]
win_name_order=win_name.orderBy(order_col_names)
df_input=df
col_names=["Me_1","Me_2"]
step=1
for col_name in col_names:
    lead_col_name=f"lead_{col_name}"
    df_input=df_input.select("*",lead(col_name,1).over(win_name_order).alias(lead_col_name)).drop(col_name).withColumnRenamed(lead_col_name,col_name)
df_input.show()

+----+----+----+----+----+
|Id_1|Id_2|Year|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|1993|   4| 9.0|
|   A|  XX|1994|   7| 5.0|
|   A|  XX|1995|   6| 8.0|
|   A|  XX|1996|null|null|
|   A|  YY|1993|   5| 4.0|
|   A|  YY|1994|  10| 2.0|
|   A|  YY|1995|   2| 7.0|
|   A|  YY|1996|null|null|
+----+----+----+----+----+



## 2.5 Lag()

**function spec**:

In the ordered set of Data Points of the current partition, the operator returns the value(s) taken from the Data Point at the specified physical offset prior to the current Data Point.

If defaultValue is not specified then the value returned when the offset goes outside the partition is NULL.

The lead function take two argument:
- input dataframe
- step

**Analytic clause restriction**

- Must have `orderClause`
- The `windowClause` such as `data points` and `range` are not allowed

### 2.5.1 Exp1 : window has partition clause and order by clause

below vtl query takes ds1 as input df, and 2 as step

```text
res :=  lead ( ds1,2 over ( partition by Id_1, Id_2 order by Year desc) )
```



In [29]:
partition_col_names=["Id_1","Id_2"]

order_col_names=[col("Year").desc()]

win_name=Window.partitionBy(partition_col_names).orderBy(order_col_names)

target_col_name1="Me_1"
target_col_name2="Me_2"
new_col_name1=f"lag_{target_col_name1}"
new_col_name2=f"lag_{target_col_name2}"

step=2

df_lag=df.withColumn(new_col_name1,lead(target_col_name1,step).over(win_name))\
           .withColumn(new_col_name2,lead(target_col_name2,step).over(win_name))


df_lag.show()

+----+----+----+----+----+--------+--------+
|Id_1|Id_2|Year|Me_1|Me_2|lag_Me_1|lag_Me_2|
+----+----+----+----+----+--------+--------+
|   A|  XX|1996|   6| 8.0|       4|     9.0|
|   A|  XX|1995|   7| 5.0|       3|     1.0|
|   A|  XX|1994|   4| 9.0|    null|    null|
|   A|  XX|1993|   3| 1.0|    null|    null|
|   A|  YY|1996|   2| 7.0|       5|     4.0|
|   A|  YY|1995|  10| 2.0|       9|     3.0|
|   A|  YY|1994|   5| 4.0|    null|    null|
|   A|  YY|1993|   9| 3.0|    null|    null|
+----+----+----+----+----+--------+--------+



### 2.5.2 Test to confirm example in official doc

VTL query:

```text
DS_r := lag ( DS_1 , 1 over ( partition by Id_1 , Id_2 order by Id_3 ) ) 

```

Input: DS_1
```text
Id_1 Id_2 Id_3 Me_1 Me_2
A XX 1993 3 1
A XX 1994 4 9
A XX 1995 7 5
A XX 1996 6 8
A YY 1993 9 3
A YY 1994 5 4
A YY 1995 10 2
A YY 1996 2 7

```

Output: DS_r
```text
Id_1 Id_2 Id_3 Me_1 Me_2
A XX 1993 NULL NULL
A XX 1994 3 1
A XX 1995 4 9
A XX 1996 7 5
A YY 1993 NULL NULL
A YY 1994 9 3
A YY 1995 5 4
A YY 1996 10 2
```

In [28]:
from pyspark.sql.functions import lead, lag
partition_col_name=["Id_1","Id_2"]
win_name=Window.partitionBy(partition_col_name)

order_col_name=[col("Year").asc()]
win_name_order=win_name.orderBy(order_col_name)
df_input=df
col_names=["Me_1","Me_2"]
step=1
for col_name in col_names:
    lead_col_name=f"lead_{col_name}"
    df_input=df_input.select("*",lag(col_name,1).over(win_name_order).alias(lead_col_name)).drop(col_name).withColumnRenamed(lead_col_name,col_name)
df_input.show()

+----+----+----+----+----+
|Id_1|Id_2|Year|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|1993|null|null|
|   A|  XX|1994|   3| 1.0|
|   A|  XX|1995|   4| 9.0|
|   A|  XX|1996|   7| 5.0|
|   A|  YY|1993|null|null|
|   A|  YY|1994|   9| 3.0|
|   A|  YY|1995|   5| 4.0|
|   A|  YY|1996|  10| 2.0|
+----+----+----+----+----+



## 2.6 ratio_to_report()

**function spec**
The operator returns the ratio between the value of the current Data Point and the sum of the values of the partition which the current Data Point belongs to.

**Analytic clause restriction**
The `orderClause` and `windowClause` of the Analytic invocation syntax are not allowed.

### 2.6.1 window only has partition

ETL query

```text
DS_r := ratio_to_report ( ds1 over ( partition by Id_1, Id_2 ) )
```



In [63]:
def ratio_to_report(df, partition_col_names, measurment_col_names):
    win_name=Window.partitionBy(partition_col_names)
    for col_name in measurment_col_names:
        total_col_name=f"total_{col_name}"
        df=df.withColumn(total_col_name,sum(col_name).over(win_name)).withColumn(f"ratio_{col_name}",col(col_name)/col(f"total_{col_name}"))
    return df
    

In [62]:
partition_col_names=["Id_1","Id_2"]

measurment_col_names=["Me_1","Me_2"]

df_ratio=ratio_to_report(df,partition_col_names,measurment_col_names)

df_ratio.show()

+----+----+----+----+----+----------+-------------------+----------+--------------------+
|Id_1|Id_2|Year|Me_1|Me_2|total_Me_1|         ratio_Me_1|total_Me_2|          ratio_Me_2|
+----+----+----+----+----+----------+-------------------+----------+--------------------+
|   A|  XX|1993|   3| 1.0|        20|               0.15|      23.0|0.043478260869565216|
|   A|  XX|1994|   4| 9.0|        20|                0.2|      23.0|   0.391304347826087|
|   A|  XX|1995|   7| 5.0|        20|               0.35|      23.0| 0.21739130434782608|
|   A|  XX|1996|   6| 8.0|        20|                0.3|      23.0| 0.34782608695652173|
|   A|  YY|1993|   9| 3.0|        26|0.34615384615384615|      16.0|              0.1875|
|   A|  YY|1994|   5| 4.0|        26|0.19230769230769232|      16.0|                0.25|
|   A|  YY|1995|  10| 2.0|        26|0.38461538461538464|      16.0|               0.125|
|   A|  YY|1996|   2| 7.0|        26|0.07692307692307693|      16.0|              0.4375|
+----+----

### 2.6.2 Test to confirm example in official doc

ETL query

```text
DS_r := ratio_to_report ( ds1 over ( partition by Id_1, Id_2 ) )
```


Input ds1

```text

Id_1 Id_2 Id_3 Me_1 Me_2
A XX 2000 3 1
A XX 2001 4 3
A XX 2002 7 5
A XX 2003 6 1
A YY 2000 12 0
A YY 2001 8 8
A YY 2002 6 5
A YY 2003 14 -3

```

output: DS_r

```text
Id_1 Id_2 Id_3 Me_1 Me_2
A YY 2000 0.3 0
A YY 2001 0.2 0.8
A YY 2002 0.15 0.5
A YY 2003 0.35 -0.3
A XX 2000 0.15 0,1
A XX 2001 0.2 0.3
A XX 2002 0.35 0.5
A XX 2003 0.3 0.1

```

In [65]:
data1=[
    ("A", "XX", 2001, 4, 3),
    ("A", "XX", 2002, 7, 5),
    ("A", "XX", 2000, 3, 1),
    ("A", "XX", 2003, 6, 1),
    ("A", "YY", 2000, 12, 0),
    ("A", "YY", 2001, 8, 8),
    ("A", "YY", 2002, 6, 5),
    ("A", "YY", 2003, 14, -3)]

schema1=StructType([StructField("Id_1",StringType(),True),
                   StructField("Id_2",StringType(),True),
                   StructField("Id_3",IntegerType(),True),
                   StructField("Me_1",IntegerType(),True),
                   StructField("Me_2",IntegerType(),True)])

df1=spark.createDataFrame(data1, schema1)
df1.show()

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|2001|   4|   3|
|   A|  XX|2002|   7|   5|
|   A|  XX|2000|   3|   1|
|   A|  XX|2003|   6|   1|
|   A|  YY|2000|  12|   0|
|   A|  YY|2001|   8|   8|
|   A|  YY|2002|   6|   5|
|   A|  YY|2003|  14|  -3|
+----+----+----+----+----+



In [66]:
partition_col_names=["Id_1","Id_2"]

measurment_col_names=["Me_1","Me_2"]

df_ratio=ratio_to_report(df1,partition_col_names,measurment_col_names)

df_ratio.show()

+----+----+----+----+----+----------+----------+----------+----------+
|Id_1|Id_2|Id_3|Me_1|Me_2|total_Me_1|ratio_Me_1|total_Me_2|ratio_Me_2|
+----+----+----+----+----+----------+----------+----------+----------+
|   A|  XX|2001|   4|   3|        20|       0.2|        10|       0.3|
|   A|  XX|2002|   7|   5|        20|      0.35|        10|       0.5|
|   A|  XX|2000|   3|   1|        20|      0.15|        10|       0.1|
|   A|  XX|2003|   6|   1|        20|       0.3|        10|       0.1|
|   A|  YY|2000|  12|   0|        40|       0.3|        10|       0.0|
|   A|  YY|2001|   8|   8|        40|       0.2|        10|       0.8|
|   A|  YY|2002|   6|   5|        40|      0.15|        10|       0.5|
|   A|  YY|2003|  14|  -3|        40|      0.35|        10|      -0.3|
+----+----+----+----+----+----------+----------+----------+----------+



In [56]:
from pyspark.sql.functions import sum
partition_col_name=["Id_1","Id_2"]
win_name=Window.partitionBy(partition_col_name)

order_col_name=[col("Id_3").asc()]
win_name_order=win_name.orderBy(rand(100))

col_names=["Me_1","Me_2"]
for col_name in col_names:
    total_col_name=f"total_{col_name}"
    df1=df1.withColumn(total_col_name,sum(col_name).over(win_name)).withColumn(f"ratio_{col_name}",col(col_name)/col(f"total_{col_name}")).drop(total_col_name).drop(col_name).withColumnRenamed(total_col_name,col_name)
df1.show()

+----+----+----+----------+----------+
|Id_1|Id_2|Id_3|ratio_Me_1|ratio_Me_2|
+----+----+----+----------+----------+
|   A|  XX|2001|       0.2|       0.3|
|   A|  XX|2002|      0.35|       0.5|
|   A|  XX|2000|      0.15|       0.1|
|   A|  XX|2003|       0.3|       0.1|
|   A|  YY|2000|       0.3|       0.0|
|   A|  YY|2001|       0.2|       0.8|
|   A|  YY|2002|      0.15|       0.5|
|   A|  YY|2003|      0.35|      -0.3|
+----+----+----+----------+----------+

