# VTL Simple analytic function



In [13]:
from pyspark.sql import SparkSession,DataFrame
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, LongType, DecimalType
import os
from pyspark.sql.functions import lit, count,sum,avg,collect_list

In [2]:
# Switch between a local Spark session and a Kubernetes-backed one.
local = True

if not local:
    # Kubernetes mode: the driver service account and the namespace are read
    # from environment variables of the running container.
    spark = (
        SparkSession.builder
        .master("k8s://https://kubernetes.default.svc:443")
        .appName("VTLAnalytic")
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])
        .config("spark.executor.instances", "4")
        .config("spark.executor.memory", "8g")
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])
        .getOrCreate()
    )
else:
    # Local mode: master URL is local[4].
    spark = (
        SparkSession.builder
        .master("local[4]")
        .appName("VTLAnalytic")
        .getOrCreate()
    )

22/04/12 15:45:12 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.184.146 instead (on interface ens33)
22/04/12 15:45:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/04/12 15:45:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Eight sample rows; the fields follow the schema order below:
# Id_1, Id_2, Id_3, Me_1, Me_2.
data = [
    ("A", "XX", 2000, 3, 1),
    ("A", "XX", 2001, 4, 9),
    ("A", "XX", 2002, 7, 5),
    ("A", "XX", 2003, 6, 8),
    ("A", "YY", 2000, 9, 3),
    ("A", "YY", 2001, 5, 4),
    ("A", "YY", 2002, 10, 2),
    ("A", "YY", 2003, 5, 7),
]

# Every column is declared nullable.
schema = StructType([
    StructField(col_name, col_type, True)
    for col_name, col_type in (
        ("Id_1", StringType()),
        ("Id_2", StringType()),
        ("Id_3", IntegerType()),
        ("Me_1", IntegerType()),
        ("Me_2", IntegerType()),
    )
])

df = spark.createDataFrame(data, schema)
df.show()

+----+----+----+----+----+
|Id_1|Id_2|Id_3|Me_1|Me_2|
+----+----+----+----+----+
|   A|  XX|2000|   3|   1|
|   A|  XX|2001|   4|   9|
|   A|  XX|2002|   7|   5|
|   A|  XX|2003|   6|   8|
|   A|  YY|2000|   9|   3|
|   A|  YY|2001|   5|   4|
|   A|  YY|2002|  10|   2|
|   A|  YY|2003|   5|   7|
+----+----+----+----+----+



In [5]:
from pyspark.sql.window import Window

## 1 Count

DS_r := count ( DS_1 over ( partition by Id_1 ) )

In [6]:
# Count Me_1 over all rows that share the same Id_1 value and attach the
# result to every row as a new column.
target_col_name = "Me_1"
new_col_name = "count_" + target_col_name
partition_col_name = "Id_1"
win_name = Window.partitionBy(partition_col_name)
df_count = df.withColumn(
    new_col_name,
    count(target_col_name).over(win_name),
)
df_count.show()

                                                                                

+----+----+----+----+----+----------+
|Id_1|Id_2|Id_3|Me_1|Me_2|count_Me_1|
+----+----+----+----+----+----------+
|   A|  XX|2000|   3|   1|         8|
|   A|  XX|2001|   4|   9|         8|
|   A|  XX|2002|   7|   5|         8|
|   A|  XX|2003|   6|   8|         8|
|   A|  YY|2000|   9|   3|         8|
|   A|  YY|2001|   5|   4|         8|
|   A|  YY|2002|  10|   2|         8|
|   A|  YY|2003|   5|   7|         8|
+----+----+----+----+----+----------+



In [9]:
# Print the column names, types and nullability of the input DataFrame.
df.printSchema()

root
 |-- Id_1: string (nullable = true)
 |-- Id_2: string (nullable = true)
 |-- Id_3: integer (nullable = true)
 |-- Me_1: integer (nullable = true)
 |-- Me_2: integer (nullable = true)



## Sum
We can compute a sum without an order by clause:

DS_r := sum ( DS_1 over ( partition by Id_1 ) )


In [11]:
# Sum Me_1 over all rows that share the same Id_1 value; every row of a
# partition receives the same total.
target_col_name = "Me_1"
new_col_name = "sum_" + target_col_name
partition_col_name = "Id_1"
win_name = Window.partitionBy(partition_col_name)
df_sum = df.withColumn(
    new_col_name,
    sum(target_col_name).over(win_name),
)
df_sum.show()

+----+----+----+----+----+--------+
|Id_1|Id_2|Id_3|Me_1|Me_2|sum_Me_1|
+----+----+----+----+----+--------+
|   A|  XX|2000|   3|   1|      49|
|   A|  XX|2001|   4|   9|      49|
|   A|  XX|2002|   7|   5|      49|
|   A|  XX|2003|   6|   8|      49|
|   A|  YY|2000|   9|   3|      49|
|   A|  YY|2001|   5|   4|      49|
|   A|  YY|2002|  10|   2|      49|
|   A|  YY|2003|   5|   7|      49|
+----+----+----+----+----+--------+



We can also compute the sum without a partition by clause, using only an order by clause:

DS_r := sum ( DS_1 over ( order by Id_1, Id_2, Id_3 ) )

In [12]:
# Sum Me_1 over a window that is only ordered, not partitioned.
# Despite its name, partition_col_names holds the ORDER BY columns here.
# With an ordering and no explicit frame, the recorded output is a running
# total that grows row by row.
target_col_name = "Me_1"
new_col_name = "sum_" + target_col_name
partition_col_names = ["Id_1", "Id_2", "Id_3"]
win_name = Window.orderBy(partition_col_names)
df_sum = df.withColumn(
    new_col_name,
    sum(target_col_name).over(win_name),
)
df_sum.show()

+----+----+----+----+----+--------+
|Id_1|Id_2|Id_3|Me_1|Me_2|sum_Me_1|
+----+----+----+----+----+--------+
|   A|  XX|2000|   3|   1|       3|
|   A|  XX|2001|   4|   9|       7|
|   A|  XX|2002|   7|   5|      14|
|   A|  XX|2003|   6|   8|      20|
|   A|  YY|2000|   9|   3|      29|
|   A|  YY|2001|   5|   4|      34|
|   A|  YY|2002|  10|   2|      44|
|   A|  YY|2003|   5|   7|      49|
+----+----+----+----+----+--------+



22/04/12 16:35:22 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [14]:
# Collect Me_1 over a sliding frame of one row before to one row after the
# current row, within each Id_1 partition.
#
# A row-based frame is only well defined when the window is ordered: without
# an orderBy the "previous" and "next" rows depend on the physical row order,
# so the result is not guaranteed to be reproducible. The rows are therefore
# ordered explicitly by the remaining identifier columns.
partition_col_name="Id_1"
order_col_names=["Id_2","Id_3"]
win_name=Window.partitionBy(partition_col_name).orderBy(order_col_names).rowsBetween(-1,1)
target_col_name="Me_1"
# NOTE(review): the column holds a list, not a sum; the "sum_" prefix is kept
# so the column name matches the previously recorded output. Consider renaming
# it to "collect_list_" the next time the notebook is re-run.
new_col_name=f"sum_{target_col_name}"
df_collect=df.withColumn(new_col_name,collect_list(target_col_name).over(win_name))
df_collect.show()

+----+----+----+----+----+----------+
|Id_1|Id_2|Id_3|Me_1|Me_2|  sum_Me_1|
+----+----+----+----+----+----------+
|   A|  XX|2000|   3|   1|    [3, 4]|
|   A|  XX|2001|   4|   9| [3, 4, 7]|
|   A|  XX|2002|   7|   5| [4, 7, 6]|
|   A|  XX|2003|   6|   8| [7, 6, 9]|
|   A|  YY|2000|   9|   3| [6, 9, 5]|
|   A|  YY|2001|   5|   4|[9, 5, 10]|
|   A|  YY|2002|  10|   2|[5, 10, 5]|
|   A|  YY|2003|   5|   7|   [10, 5]|
+----+----+----+----+----+----------+



In [16]:
# Demonstrates that a range frame cannot be used on an unordered window:
# Spark rejects the window specification with an AnalysisException.
# The exception is caught and printed so that the notebook still runs from
# top to bottom (Restart & Run All) instead of stopping at this cell.
from pyspark.sql.utils import AnalysisException

partition_col_name="Id_1"
# No orderBy on purpose: this is what triggers the error.
# NOTE(review): to get a working range frame, an orderBy on a single numeric
# column (presumably Id_3) is needed before rangeBetween. Confirm the intended
# ordering column.
win_name=Window.partitionBy(partition_col_name).rangeBetween(1,1)
target_col_name="Me_1"
new_col_name=f"sum_{target_col_name}"
try:
    # Per the recorded error, the invalid window is rejected when the plan is
    # analysed, so df_collect1 is never assigned.
    df_collect1=df.withColumn(new_col_name,collect_list(target_col_name).over(win_name))
    df_collect1.show()
except AnalysisException as exc:
    print(f"AnalysisException: {exc}")


AnalysisException: cannot resolve '(PARTITION BY `Id_1` RANGE BETWEEN 1L FOLLOWING AND 1L FOLLOWING)' due to data type mismatch: A range window frame cannot be used in an unordered window specification.;
'Project [Id_1#0, Id_2#1, Id_3#2, Me_1#3, Me_2#4, collect_list(Me_1#3, 0, 0) windowspecdefinition(Id_1#0, specifiedwindowframe(RangeFrame, 1, 1)) AS sum_Me_1#767]
+- LogicalRDD [Id_1#0, Id_2#1, Id_3#2, Me_1#3, Me_2#4], false
