In [13]:
from pyspark.sql import SparkSession
import os
from pyspark.sql.functions import col, min, sum, collect_list

In [2]:
local = True

if local:
    spark = SparkSession.builder \
        .master("local[4]") \
        .appName("VTLPerformanceTest") \
        .getOrCreate()
    sf_fire_path = "/home/pliu/data_set/sf_fire"
else:
    spark = SparkSession.builder \
        .master("k8s://https://kubernetes.default.svc:443") \
        .appName("VTLPerformanceTest") \
        .config("spark.kubernetes.container.image", "inseefrlab/jupyter-datascience:py3.9.7-spark3.2.0")\
        .config("spark.kubernetes.authenticate.driver.serviceAccountName", os.environ['KUBERNETES_SERVICE_ACCOUNT'])\
        .config("spark.executor.instances", "4")\
        .config("spark.executor.memory", "8g")\
        .config("spark.kubernetes.namespace", os.environ['KUBERNETES_NAMESPACE'])\
        .getOrCreate()
    sf_fire_path = "s3a://pengfei/diffusion/data_format/sf_fire/parquet/spark_sf_fire_snappy"

22/03/04 11:55:10 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 192.168.184.146 instead (on interface ens33)
22/03/04 11:55:10 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/04 11:55:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
df = spark.read.parquet(sf_fire_path)

                                                                                

In [4]:
df.show(5, truncate=False)

22/03/04 11:55:23 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                          (0 + 1) / 1]

+----------+------+--------------+-----------------------------+----------+----------+----------------------+----------------------+----------------------+----------------------+----------------------+-------------+------------+--------------------+----------------------+------------------------------------+-------------+-----------------+---------+-----------+----+----------------+--------+-------------+-------+-------------+--------------+--------+--------------------------+----------------------+------------------+--------------------+-------------+---------------------------------------------+
|CallNumber|UnitID|IncidentNumber|CallType                     |CallDate  |WatchDate |ReceivedDtTm          |EntryDtTm             |DispatchDtTm          |ResponseDtTm          |OnSceneDtTm           |TransportDtTm|HospitalDtTm|CallFinalDisposition|AvailableDtTm         |Address                             |City         |ZipcodeofIncident|Battalion|StationArea|Box |OriginalPriority|Priority|F

                                                                                

In [5]:
def get_actions():
    components=[]
    # agg function 1
    component1 = {
        "action": "sum",
        "colName": "NumberofAlarms",
        "targetColName": "sumNumberofAlarms"}

    component2 = {
        "action": "min",
        "colName": "NumberofAlarms",
        "targetColName": "minNumberofAlarms"}
    component3 = {
        "action":  "avg",
        "colName": "NumberofAlarms",
        "targetColName": "avgNumberofAlarms"}

    component4 = {
        "action": "max",
        "colName": "NumberofAlarms",
        "targetColName": "maxNumberofAlarms"}

    component5 = {
        "action":  "count",
        "colName": "*",
        "targetColName": "countVal"}

    component6 = {
        "action": "collect_list",
        "colName": "City",
        "targetColName": "allCities"}
    components.append(component1)
    components.append(component2)
    components.append(component3)
    components.append(component4)
    components.append(component5)
    components.append(component6)
    return components

In [6]:
def joinIndividualAggExample(df, groupByColName, components, joinColName):
    dfs = []

    # here I simulate the logic of each componentExpressionVisitor.visit(groupFunctionCtx.expr()) in aggrClause.visitAggrClause()
    for component in components:
        action=component["action"]
        colName=component["colName"]
        targetColName=component["targetColName"]
        df_group = df.groupBy(groupByColName)
        if action=="min":
            df_min = df_group.min(colName).alias(targetColName)
            dfs.append(df_min)
        elif action=="max":
            df_max = df_group.max(colName).alias(targetColName)
            dfs.append(df_max)
        elif action=="avg":
            df_avg = df_group.avg(colName).alias(targetColName)
            dfs.append(df_avg)
        elif action=="sum":
            df_sum = df_group.sum(colName).alias(targetColName)
            dfs.append(df_sum)
        elif action=="count":
            df_count = df_group.count().alias(targetColName)
            dfs.append(df_count)
        elif action=="collect_list":
            df_collect = df_group.agg(collect_list(colName).alias(targetColName))
            dfs.append(df_collect)
        else:
            print("Unknown aggregation action")

    # join the result
    dfl = dfs[0]
    for i in range(1,len(dfs)):
        dfr = dfs[i]
        dfl = dfl.join(dfr, joinColName)
    dfl.show(5)

In [10]:
# run join
groupByColName="CallType"
components=get_actions()
joinIndividualAggExample(df, groupByColName, components, groupByColName)

                                                                                

+--------------------+-------------------+-------------------+-------------------+-------------------+-----+--------------------+
|            CallType|sum(NumberofAlarms)|min(NumberofAlarms)|avg(NumberofAlarms)|max(NumberofAlarms)|count|           allCities|
+--------------------+-------------------+-------------------+-------------------+-------------------+-----+--------------------+
|Elevator / Escala...|              14840|                  1|                1.0|                  1|14840|[San Francisco, S...|
|         Marine Fire|                425|                  1|                1.0|                  1|  425|[Presidio, Presid...|
|  Aircraft Emergency|               1511|                  1|                1.0|                  1| 1511|[SFO, SFO, SFO, S...|
|Confined Space / ...|                631|                  1|                1.0|                  1|  631|[San Francisco, S...|
|      Administrative|                286|                  1|                1.0|        

In [8]:
def aggregateDynamicExample(df, groupByColName,components):
    temptableName = "users"
    full_agg_expr = ""
    # simulate dynamic full spark sql query building in processingEngine.executeAggr()
    for i in range(0, len(components)):
        com = components[i]
        action:str = com["action"]
        colName= com["colName"]
        targetColName= com["targetColName"]
        if action == "count":
            msg = f"{action}(*) as {targetColName}"
        else:
            msg = f"{action}({colName}) as {targetColName}"

        if i == len(components) - 1:
            full_agg_expr = full_agg_expr + msg + " "
        else:
            full_agg_expr = full_agg_expr + msg + ", "
    full_message = f"Select {groupByColName}, {full_agg_expr} from {temptableName} group by {groupByColName}"
    print(full_message)
    df.createOrReplaceTempView(temptableName)
    result = spark.sql(full_message)
    result.show(5)

In [9]:
aggregateDynamicExample(df,groupByColName,components)

Select CallType, sum(NumberofAlarms) as sumNumberofAlarms, min(NumberofAlarms) as minNumberofAlarms, avg(NumberofAlarms) as avgNumberofAlarms, max(NumberofAlarms) as maxNumberofAlarms, count(*) as countVal, collect_list(City) as allCities  from users group by CallType


                                                                                

+--------------------+-----------------+-----------------+-----------------+-----------------+--------+--------------------+
|            CallType|sumNumberofAlarms|minNumberofAlarms|avgNumberofAlarms|maxNumberofAlarms|countVal|           allCities|
+--------------------+-----------------+-----------------+-----------------+-----------------+--------+--------------------+
|Elevator / Escala...|            14840|                1|              1.0|                1|   14840|[San Francisco, S...|
|         Marine Fire|              425|                1|              1.0|                1|     425|[Presidio, Presid...|
|  Aircraft Emergency|             1511|                1|              1.0|                1|    1511|[SFO, SFO, SFO, S...|
|Confined Space / ...|              631|                1|              1.0|                1|     631|[San Francisco, S...|
|      Administrative|              286|                1|              1.0|                1|     286|[San Francisco, S...|


In [14]:
df.groupBy("CallType").agg(min(col()),
                           sum("NumberofAlarms"),
                           collect_list("City"),
                           max()).show(5)

                                                                                

+--------------------+-------------------+-------------------+--------------------+
|            CallType|min(NumberofAlarms)|sum(NumberofAlarms)|  collect_list(City)|
+--------------------+-------------------+-------------------+--------------------+
|Elevator / Escala...|                  1|              14840|[San Francisco, S...|
|         Marine Fire|                  1|                425|[Presidio, Presid...|
|  Aircraft Emergency|                  1|               1511|[SFO, SFO, SFO, S...|
|Confined Space / ...|                  1|                631|[San Francisco, S...|
|      Administrative|                  1|                286|[San Francisco, S...|
+--------------------+-------------------+-------------------+--------------------+
only showing top 5 rows



                                                                                