In [3]:
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, LongType, DecimalType
import os
from pyspark.sql.functions import lit, count, var_samp
from pyspark.sql.window import Window

In [4]:
# Switch between an in-process Spark session (True) and a Kubernetes-backed one (False).
local = True

if not local:
    # Cluster mode: the service account and namespace are read from the
    # environment; four executors are requested with 8g of memory each.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("VTLAnalytic")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .getOrCreate()
    )
else:
    # Local mode: master URL "local[4]".
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("VTLAnalytic")
        .getOrCreate()
    )

In [5]:
# Eight sample rows, each laid out as (Id_1, Id_2, Id_3, Me_1, Me_2).
data = [
    ("A", "XX", 2000, 3, 1),
    ("A", "XX", 2001, 4, 9),
    ("A", "XX", 2002, 7, 5),
    ("A", "XX", 2003, 6, 8),
    ("A", "YY", 2000, 9, 3),
    ("A", "YY", 2001, 5, 4),
    ("A", "YY", 2002, 10, 2),
    ("A", "YY", 2003, 5, 7),
]

# Two nullable string columns followed by three nullable integer columns.
schema = StructType(
    [StructField(col_name, StringType(), True) for col_name in ("Id_1", "Id_2")]
    + [StructField(col_name, IntegerType(), True) for col_name in ("Id_3", "Me_1", "Me_2")]
)

df = spark.createDataFrame(data, schema)
df.show()

                                                                                

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|2000|   3|   1|
|   A|  XX|2001|   4|   9|
|   A|  XX|2002|   7|   5|
|   A|  XX|2003|   6|   8|
|   A|  YY|2000|   9|   3|
|   A|  YY|2001|   5|   4|
|   A|  YY|2002|  10|   2|
|   A|  YY|2003|   5|   7|
+----+----+----+----+----+



## VarSamp Example

pyspark.sql.functions.var_samp(col):

Aggregate function: returns the unbiased sample variance of the values in a group.

In [6]:
# The calculation below simulates the VTL syntax "res := var_samp ( ds1 over ( partition by Id_1 order by Id_3 range between 1 preceding and 1 following) );"

# Window frame: rows sharing the same Id_1 whose Id_3 value lies within
# -1..+1 of the current row's Id_3 (a range frame, not a row-count frame).
win_name = (
    Window
    .partitionBy("Id_1")
    .orderBy("Id_3")
    .rangeBetween(-1, 1)
)

# Measures to aggregate and the names of the columns that receive the results.
target_col1, target_col2 = "Me_1", "Me_2"
new_col_name1 = "var_samp_" + target_col1
new_col_name2 = "var_samp_" + target_col2

# Append one sample-variance column per measure, both evaluated over the same window.
df_resu = (
    df
    .withColumn(new_col_name1, var_samp(target_col1).over(win_name))
    .withColumn(new_col_name2, var_samp(target_col2).over(win_name))
)

In [7]:
# Print the result: the original columns plus the two windowed var_samp columns.
df_resu.show()

+----+----+----+----+----+-----------------+------------------+
|Id_1|Id_2|Id_3|Me_1|Me_2|    var_samp_Me_1|     var_samp_Me_2|
+----+----+----+----+----+-----------------+------------------+
|   A|  XX|2000|   3|   1|6.916666666666667|11.583333333333334|
|   A|  YY|2000|   9|   3|6.916666666666667|11.583333333333334|
|   A|  XX|2001|   4|   9|7.866666666666667| 8.000000000000002|
|   A|  YY|2001|   5|   4|7.866666666666667| 8.000000000000002|
|   A|  XX|2002|   7|   5|4.566666666666666| 6.966666666666667|
|   A|  YY|2002|  10|   2|4.566666666666666| 6.966666666666667|
|   A|  XX|2003|   6|   8|4.666666666666667|               7.0|
|   A|  YY|2003|   5|   7|4.666666666666667|               7.0|
+----+----+----+----+----+-----------------+------------------+

