# VTL Simple analytic function



In [2]:
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, LongType, DecimalType
import os
from pyspark.sql.functions import lit, count,sum,avg,collect_list,min,max,percentile_approx,stddev_pop,stddev_samp,var_pop,var_samp
from pyspark.sql.window import Window

In [3]:
local = True

if local:
    spark = SparkSession.builder \
        .master("local[4]") \
        .appName("VTLAnalytic")\
        .getOrCreate()
else:
    spark = SparkSession.builder\
        .master("k8s://https://kubernetes.default.svc:443") \
        .appName("VTLAnalytic")\
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")\
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])\
        .config("spark.executor.instances", "4")\
        .config("spark.executor.memory", "8g")\
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])\
        .getOrCreate()

22/04/14 14:13:00 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.184.146 instead (on interface ens33)
22/04/14 14:13:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/04/14 14:13:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
data=[("A", "XX", 2000, 3, 1),
    ("A", "XX", 2001, 4, 9),
    ("A", "XX", 2002, 7, 5),
    ("A", "XX", 2003, 6, 8),
    ("A", "YY", 2000, 9, 3),
    ("A", "YY", 2001, 5, 4),
    ("A", "YY", 2002, 10, 2),
    ("A", "YY", 2003, 5, 7)]

schema=StructType([StructField("Id_1",StringType(),True),
                   StructField("Id_2",StringType(),True),
                   StructField("Id_3",IntegerType(),True),
                   StructField("Me_1",IntegerType(),True),
                   StructField("Me_2",IntegerType(),True)])

df=spark.createDataFrame(data, schema)
df.show()

                                                                                

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|2000|   3|   1|
|   A|  XX|2001|   4|   9|
|   A|  XX|2002|   7|   5|
|   A|  XX|2003|   6|   8|
|   A|  YY|2000|   9|   3|
|   A|  YY|2001|   5|   4|
|   A|  YY|2002|  10|   2|
|   A|  YY|2003|   5|   7|
+----+----+----+----+----+



## 1.1 Count

count ( DS_1 over ( partition by Id_1 ) )

In [5]:
partition_col_name="Id_1"
win_name=Window.partitionBy(partition_col_name)
target_col_name="Me_1"
new_col_name=f"count_{target_col_name}"
df_count=df.withColumn(new_col_name,count(target_col_name).over(win_name))
df_count.show()

                                                                                

+----+----+----+----+----+----------+
|Id_1|Id_2|Id_3|Me_1|Me_2|count_Me_1|
+----+----+----+----+----+----------+
|   A|  XX|2000|   3|   1|         8|
|   A|  XX|2001|   4|   9|         8|
|   A|  XX|2002|   7|   5|         8|
|   A|  XX|2003|   6|   8|         8|
|   A|  YY|2000|   9|   3|         8|
|   A|  YY|2001|   5|   4|         8|
|   A|  YY|2002|  10|   2|         8|
|   A|  YY|2003|   5|   7|         8|
+----+----+----+----+----+----------+



In [6]:
df.printSchema()

root
 |-- Id_1: string (nullable = true)
 |-- Id_2: string (nullable = true)
 |-- Id_3: integer (nullable = true)
 |-- Me_1: integer (nullable = true)
 |-- Me_2: integer (nullable = true)



## 1.2 Sum
We can do a count without order by
DS_r :=count ( DS_1 over ( partition by Id_1 ) )


In [7]:
partition_col_name="Id_1"
win_name=Window.partitionBy(partition_col_name)
target_col_name="Me_1"
new_col_name=f"sum_{target_col_name}"
df_sum=df.withColumn(new_col_name,sum(target_col_name).over(win_name))
df_sum.show()

+----+----+----+----+----+--------+
|Id_1|Id_2|Id_3|Me_1|Me_2|sum_Me_1|
+----+----+----+----+----+--------+
|   A|  XX|2000|   3|   1|      49|
|   A|  XX|2001|   4|   9|      49|
|   A|  XX|2002|   7|   5|      49|
|   A|  XX|2003|   6|   8|      49|
|   A|  YY|2000|   9|   3|      49|
|   A|  YY|2001|   5|   4|      49|
|   A|  YY|2002|  10|   2|      49|
|   A|  YY|2003|   5|   7|      49|
+----+----+----+----+----+--------+



We can also do without partition

DS_r := sum ( DS_1 over ( order by Id_1, Id_2, Id_3 ) )

In [8]:
partition_col_names=["Id_1","Id_2","Id_3"]
win_name=Window.orderBy(partition_col_names)
target_col_name="Me_1"
new_col_name=f"sum_{target_col_name}"
df_sum=df.withColumn(new_col_name,sum(target_col_name).over(win_name))
df_sum.show()

+----+----+----+----+----+--------+
|Id_1|Id_2|Id_3|Me_1|Me_2|sum_Me_1|
+----+----+----+----+----+--------+
|   A|  XX|2000|   3|   1|       3|
|   A|  XX|2001|   4|   9|       7|
|   A|  XX|2002|   7|   5|      14|
|   A|  XX|2003|   6|   8|      20|
|   A|  YY|2000|   9|   3|      29|
|   A|  YY|2001|   5|   4|      34|
|   A|  YY|2002|  10|   2|      44|
|   A|  YY|2003|   5|   7|      49|
+----+----+----+----+----+--------+



22/04/14 14:14:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [9]:
partition_col_name="Id_1"
win_name=Window.partitionBy(partition_col_name).rowsBetween(-1,1)
target_col_name="Me_1"
new_col_name=f"sum_{target_col_name}"
df_collect=df.withColumn(new_col_name,collect_list(target_col_name).over(win_name))
df_collect.show()

+----+----+----+----+----+----------+
|Id_1|Id_2|Id_3|Me_1|Me_2|  sum_Me_1|
+----+----+----+----+----+----------+
|   A|  XX|2000|   3|   1|    [3, 4]|
|   A|  XX|2001|   4|   9| [3, 4, 7]|
|   A|  XX|2002|   7|   5| [4, 7, 6]|
|   A|  XX|2003|   6|   8| [7, 6, 9]|
|   A|  YY|2000|   9|   3| [6, 9, 5]|
|   A|  YY|2001|   5|   4|[9, 5, 10]|
|   A|  YY|2002|  10|   2|[5, 10, 5]|
|   A|  YY|2003|   5|   7|   [10, 5]|
+----+----+----+----+----+----------+



In [10]:
partition_col_name=["Id_1","Id_2"]
order_col_name=["Id_3"]
win_name=Window.partitionBy(partition_col_name).orderBy(order_col_name).rangeBetween(-3,3)
target_col_name="Me_1"
new_col_name=f"sum_{target_col_name}"
df_collect1=df.withColumn(new_col_name,collect_list(target_col_name).over(win_name))
df_collect1.show()


+----+----+----+----+----+-------------+
|Id_1|Id_2|Id_3|Me_1|Me_2|     sum_Me_1|
+----+----+----+----+----+-------------+
|   A|  YY|2000|   9|   3|[9, 5, 10, 5]|
|   A|  YY|2001|   5|   4|[9, 5, 10, 5]|
|   A|  YY|2002|  10|   2|[9, 5, 10, 5]|
|   A|  YY|2003|   5|   7|[9, 5, 10, 5]|
|   A|  XX|2000|   3|   1| [3, 4, 7, 6]|
|   A|  XX|2001|   4|   9| [3, 4, 7, 6]|
|   A|  XX|2002|   7|   5| [3, 4, 7, 6]|
|   A|  XX|2003|   6|   8| [3, 4, 7, 6]|
+----+----+----+----+----+-------------+



## 1.3 min

DS_r := min ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 ) )


In [11]:
partition_col_names=["Id_1","Id_2"]
order_by_col=["Id_3"]
win_name=Window.partitionBy(partition_col_name).orderBy(order_by_col)
target_col_name="Me_1"
new_col_name=f"min_{target_col_name}"
df_min=df.withColumn(new_col_name,min(target_col_name).over(win_name))
df_min.show()

+----+----+----+----+----+--------+
|Id_1|Id_2|Id_3|Me_1|Me_2|min_Me_1|
+----+----+----+----+----+--------+
|   A|  YY|2000|   9|   3|       9|
|   A|  YY|2001|   5|   4|       5|
|   A|  YY|2002|  10|   2|       5|
|   A|  YY|2003|   5|   7|       5|
|   A|  XX|2000|   3|   1|       3|
|   A|  XX|2001|   4|   9|       3|
|   A|  XX|2002|   7|   5|       3|
|   A|  XX|2003|   6|   8|       3|
+----+----+----+----+----+--------+



## 1.4 max

DS_r := max ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 ) )

In [12]:
partition_col_names=["Id_1","Id_2"]
order_by_col=["Id_3"]
win_name=Window.partitionBy(partition_col_name).orderBy(order_by_col)
target_col_name="Me_1"
new_col_name=f"max_{target_col_name}"
df_min=df.withColumn(new_col_name,max(target_col_name).over(win_name))
df_min.show()

+----+----+----+----+----+--------+
|Id_1|Id_2|Id_3|Me_1|Me_2|max_Me_1|
+----+----+----+----+----+--------+
|   A|  YY|2000|   9|   3|       9|
|   A|  YY|2001|   5|   4|       9|
|   A|  YY|2002|  10|   2|      10|
|   A|  YY|2003|   5|   7|      10|
|   A|  XX|2000|   3|   1|       3|
|   A|  XX|2001|   4|   9|       4|
|   A|  XX|2002|   7|   5|       7|
|   A|  XX|2003|   6|   8|       7|
+----+----+----+----+----+--------+



## 1.5 avg

DS_r := avg ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 ) )


In [13]:
partition_col_names=["Id_1","Id_2"]
order_by_col=["Id_3"]
win_name=Window.partitionBy(partition_col_name).orderBy(order_by_col)
target_col_name="Me_1"
new_col_name=f"avg_{target_col_name}"
df_min=df.withColumn(new_col_name,avg(target_col_name).over(win_name))
df_min.show()

+----+----+----+----+----+-----------------+
|Id_1|Id_2|Id_3|Me_1|Me_2|         avg_Me_1|
+----+----+----+----+----+-----------------+
|   A|  YY|2000|   9|   3|              9.0|
|   A|  YY|2001|   5|   4|              7.0|
|   A|  YY|2002|  10|   2|              8.0|
|   A|  YY|2003|   5|   7|             7.25|
|   A|  XX|2000|   3|   1|              3.0|
|   A|  XX|2001|   4|   9|              3.5|
|   A|  XX|2002|   7|   5|4.666666666666667|
|   A|  XX|2003|   6|   8|              5.0|
+----+----+----+----+----+-----------------+



## 1.6 median


DS_r := median ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 ) )

percentile_approx with partitionBy followed by orderBy will do rolling median. To have median of each partition, you must only have partitionBy in your window definition.

In [14]:
partition_col_names=["Id_1","Id_2"]
order_by_col=["Id_3"]
win_name=Window.partitionBy(partition_col_name)
target_col_name="Me_1"
new_col_name=f"avg_{target_col_name}"
df_min=df.withColumn(new_col_name,percentile_approx(target_col_name,0.5,10000000).over(win_name))
df_min.show()

+----+----+----+----+----+--------+
|Id_1|Id_2|Id_3|Me_1|Me_2|avg_Me_1|
+----+----+----+----+----+--------+
|   A|  YY|2000|   9|   3|       5|
|   A|  YY|2001|   5|   4|       5|
|   A|  YY|2002|  10|   2|       5|
|   A|  YY|2003|   5|   7|       5|
|   A|  XX|2000|   3|   1|       4|
|   A|  XX|2001|   4|   9|       4|
|   A|  XX|2002|   7|   5|       4|
|   A|  XX|2003|   6|   8|       4|
+----+----+----+----+----+--------+



## 1.7 stddev_pop

The operator returns the “population standard deviation” of the input values.

DS_r := stddev_pop ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 ) )

In [15]:
partition_col_names=["Id_1","Id_2"]
order_by_col=["Id_3"]
win_name=Window.partitionBy(partition_col_name)
target_col_name="Me_1"
new_col_name=f"stddev_pop_{target_col_name}"
df_min=df.withColumn(new_col_name,stddev_pop(target_col_name).over(win_name))
df_min.show()

+----+----+----+----+----+------------------+
|Id_1|Id_2|Id_3|Me_1|Me_2|   stddev_pop_Me_1|
+----+----+----+----+----+------------------+
|   A|  YY|2000|   9|   3| 2.277608394786075|
|   A|  YY|2001|   5|   4| 2.277608394786075|
|   A|  YY|2002|  10|   2| 2.277608394786075|
|   A|  YY|2003|   5|   7| 2.277608394786075|
|   A|  XX|2000|   3|   1|1.5811388300841895|
|   A|  XX|2001|   4|   9|1.5811388300841895|
|   A|  XX|2002|   7|   5|1.5811388300841895|
|   A|  XX|2003|   6|   8|1.5811388300841895|
+----+----+----+----+----+------------------+



## 1.8 stddev_samp

The operator returns the “sample standard deviation” of the input values.

DS_r :=  stddev_stamp ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 ) )

In [16]:
partition_col_names=["Id_1","Id_2"]
order_by_col=["Id_3"]
win_name=Window.partitionBy(partition_col_name)
target_col_name="Me_1"
new_col_name=f"stddev_samp_{target_col_name}"
df_min=df.withColumn(new_col_name,stddev_samp(target_col_name).over(win_name))
df_min.show()

+----+----+----+----+----+------------------+
|Id_1|Id_2|Id_3|Me_1|Me_2|  stddev_samp_Me_1|
+----+----+----+----+----+------------------+
|   A|  YY|2000|   9|   3|2.6299556396765835|
|   A|  YY|2001|   5|   4|2.6299556396765835|
|   A|  YY|2002|  10|   2|2.6299556396765835|
|   A|  YY|2003|   5|   7|2.6299556396765835|
|   A|  XX|2000|   3|   1|1.8257418583505536|
|   A|  XX|2001|   4|   9|1.8257418583505536|
|   A|  XX|2002|   7|   5|1.8257418583505536|
|   A|  XX|2003|   6|   8|1.8257418583505536|
+----+----+----+----+----+------------------+



## 1.9 var_pop


DS_r :=  var_pop ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 ) )
The operator returns the “population variance” of the input values

In [17]:
partition_col_names=["Id_1","Id_2"]
order_by_col=["Id_3"]
win_name=Window.partitionBy(partition_col_name)
target_col_name="Me_1"
new_col_name=f"var_pop_{target_col_name}"
df_min=df.withColumn(new_col_name,var_pop(target_col_name).over(win_name))
df_min.show()

+----+----+----+----+----+------------------+
|Id_1|Id_2|Id_3|Me_1|Me_2|      var_pop_Me_1|
+----+----+----+----+----+------------------+
|   A|  YY|2000|   9|   3|            5.1875|
|   A|  YY|2001|   5|   4|            5.1875|
|   A|  YY|2002|  10|   2|            5.1875|
|   A|  YY|2003|   5|   7|            5.1875|
|   A|  XX|2000|   3|   1|2.4999999999999996|
|   A|  XX|2001|   4|   9|2.4999999999999996|
|   A|  XX|2002|   7|   5|2.4999999999999996|
|   A|  XX|2003|   6|   8|2.4999999999999996|
+----+----+----+----+----+------------------+



## 1.10 var_samp


DS_r :=  var_samp ( DS_1 over ( partition by Id_1, Id_2 order by Id_3 ) )

The operator returns the “sample variance” of the input values

In [18]:
partition_col_names=["Id_1","Id_2"]
order_by_col=["Id_3"]
win_name=Window.partitionBy(partition_col_name)
target_col_name="Me_1"
new_col_name=f"var_samp_{target_col_name}"
df_min=df.withColumn(new_col_name,var_samp(target_col_name).over(win_name))
df_min.show()

+----+----+----+----+----+------------------+
|Id_1|Id_2|Id_3|Me_1|Me_2|     var_samp_Me_1|
+----+----+----+----+----+------------------+
|   A|  YY|2000|   9|   3| 6.916666666666667|
|   A|  YY|2001|   5|   4| 6.916666666666667|
|   A|  YY|2002|  10|   2| 6.916666666666667|
|   A|  YY|2003|   5|   7| 6.916666666666667|
|   A|  XX|2000|   3|   1|3.3333333333333326|
|   A|  XX|2001|   4|   9|3.3333333333333326|
|   A|  XX|2002|   7|   5|3.3333333333333326|
|   A|  XX|2003|   6|   8|3.3333333333333326|
+----+----+----+----+----+------------------+

