In [1]:
# Import app configuration
from typing import Dict
from config import APP_CONFIG

# Per-section settings pulled from the application configuration.
spark_config_dict: Dict[str, str]
input_config_dict: Dict[str, str]
output_config_dict: Dict[str, str]

# A section that is absent from APP_CONFIG falls back to its own empty dict.
spark_config_dict, input_config_dict, output_config_dict = (
    APP_CONFIG.get(section_name, {})
    for section_name in ('spark', 'input', 'output')
)

# Application name, reused later for the Spark session and the output name.
SPARK_APP_NAME = spark_config_dict.get('name', 'spark-app')

In [2]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Explicit imports come after the wildcard so annotations such as `Column`
# do not depend on what the wildcard import happens to export.
from pyspark.sql.column import Column
from pyspark.sql.dataframe import DataFrame

# Export the memory setting through the environment before any Spark object is built.
# NOTE(review): SPARK_MEM looks like a legacy variable - confirm it is still honoured.
os.environ['SPARK_MEM'] = spark_config_dict.get('memory', '24g')

# Spark property suffix -> fallback value. The same suffix is the lookup key
# in the 'spark' section of the app config; properties are set in this order.
spark_property_defaults = (
    ('driver.memory', '4g'),
    ('executor.memory', '5g'),
    ('executor.cores', '3'),
    ('executor.instances', '4'),
    ('dynamicAllocation.enabled', 'false'),
)

spark_conf = SparkConf().setAll([
    (f'spark.{suffix}', spark_config_dict.get(suffix, fallback))
    for suffix, fallback in spark_property_defaults
])

master_url = spark_config_dict.get('master', 'local')
log_level = spark_config_dict.get('logLevel', 'WARN')

# Build the session (or reuse an already running one) with the settings above.
spark_session = (
    SparkSession.builder
    .appName(name=SPARK_APP_NAME)
    .master(master=master_url)
    .config(conf=spark_conf)
    .getOrCreate()
)

spark_session.sparkContext.setLogLevel(log_level)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/21 09:48:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Location of the metadata-request parquet dataset, read from the 'input' config section.
metadata_request_path = input_config_dict.get('path.metadataRequest')
if not metadata_request_path:
    # Fail early with a readable message instead of handing None to the reader.
    raise ValueError("Missing 'path.metadataRequest' in the 'input' section of APP_CONFIG")

# `header` is a CSV reader option and has no meaning for parquet, so it is not passed.
ec_metadata_df: DataFrame = spark_session.read.parquet(metadata_request_path)

# Preview the rows with the most recent requestTimestamp first.
ec_metadata_df.sort(desc('requestTimestamp')).show(truncate=False)



+-----------------------------+-------------------------------------------------------------------------------------------------------------------+---------------+-------------------+---------------------------------------------------------------------------------------------------+---------------------+---------------------+-------------------+--------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------+-------------+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [7]:
# Preview the rows with the oldest requestTimestamp first.
ec_metadata_df.orderBy(col('requestTimestamp').asc()).show(truncate=False)



+-----------------------------+-----------------------------------------------------------------------------------------------------------+---------------+-------------------+-------------------------------------------------------------------------------------------+---------------------+---------------------+-------------------+--------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------+-------------+-------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [None]:
# Status codes of interest: codes that start with CON or REC, except REC090.
# The previous pattern "(?!REC090)(CON*|REC*)" was unanchored and applied `*` to a
# single character, so it accepted any code that merely contained "CO" or "RE".
# NOTE(review): prefix matching is the presumed intent - confirm against real status codes.
STATUS_CODE_PATTERN = '^(?!REC090)(?:CON|REC)'

# `filter` and `abs` below are the Spark SQL functions brought in by the wildcard
# import of pyspark.sql.functions; they shadow the Python builtins of the same name.
filtered_event_list_column: Column = filter(
    ec_metadata_df.event_list,
    lambda event: rlike(event.paperProg_statusCode, lit(STATUS_CODE_PATTERN))
)

# Turn every retained event into a (dayDiff, status, statusDescription, request) struct,
# collected per input row in the array column 'dayDiffStatus'.
filtered_event_df: DataFrame = ec_metadata_df.select(
    transform(
        filtered_event_list_column,
        lambda event: struct(
            # Absolute day distance between the status timestamp and the client request timestamp.
            abs(datediff(event.paperProg_statusDateTime, event.paperProg_clientRequestTimeStamp)).alias('dayDiff'),
            # status and statusDescription fall back to each other when one of them is null.
            coalesce(event.paperProg_status, event.paperProg_statusDescription).alias('status'),
            coalesce(event.paperProg_statusDescription, event.paperProg_status).alias('statusDescription'),
            # Carry the owning request id along with every event.
            ec_metadata_df.requestId.alias('request')
        )
    ).alias('dayDiffStatus')
)

filtered_event_df.show(truncate=False)

In [None]:
# Expand the array of structs into one row per event with the struct fields as
# top-level columns, then discard the original array column.
event_select_unwrapped_df: DataFrame = (
    filtered_event_df
    .select(col('*'), inline(col('dayDiffStatus')))
    .drop('dayDiffStatus')
)
event_select_unwrapped_df.show(truncate=False)

In [None]:
# Columns that identify one output bucket.
grouping_columns = ['dayDiff', 'status', 'statusDescription']

# Count events per (dayDiff, status, statusDescription), ordered by dayDiff;
# buckets without a dayDiff are left out.
# Request ids are not gathered per bucket (a collect_set over `request` could be
# added to the aggregation if they are ever needed).
event_grouped_by_daydiff_and_status_df: DataFrame = (
    event_select_unwrapped_df
    .groupBy(*grouping_columns)
    .agg(count('*').alias('count'))
    .sort('dayDiff')
    .where(col('dayDiff').isNotNull())
)

In [None]:
from utils.custom_data_frame_writer import CustomDataFrameWriter

# Write out the aggregated dataframe as a single partition.
# repartition(1) shuffles the rows, so the earlier sort on dayDiff is not guaranteed
# to survive it; sorting again inside the single partition keeps the output ordered.
CustomDataFrameWriter.write(
    df=event_grouped_by_daydiff_and_status_df.repartition(1).sortWithinPartitions('dayDiff'),
    output_name=SPARK_APP_NAME,
    output_folder=output_config_dict.get('path'),
    output_format=output_config_dict.get('format')
)