In [1]:
# Import app configuration
from typing import Dict
from config import APP_CONFIG

# Pull each configuration section out of APP_CONFIG. A section that is absent
# falls back to an empty mapping, so later .get() lookups still work.
spark_config_dict: Dict[str, str] = APP_CONFIG.get('spark', {})
input_config_dict: Dict[str, str] = APP_CONFIG.get('input', {})
output_config_dict: Dict[str, str] = APP_CONFIG.get('output', {})

# Name handed to the Spark session builder; 'spark-app' when not configured.
SPARK_APP_NAME = spark_config_dict.get('name', 'spark-app')

In [2]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.dataframe import DataFrame

# Export the configured memory size to the environment before the session is built.
# NOTE(review): SPARK_MEM looks like a legacy Spark variable - confirm it is still honoured.
os.environ['SPARK_MEM'] = spark_config_dict.get('memory', '24g')

# Every Spark property below is looked up in the 'spark' config section under
# its name without the 'spark.' prefix; the second element is the fallback value.
spark_conf = SparkConf().setAll([
    (f'spark.{option}', spark_config_dict.get(option, fallback))
    for option, fallback in (
        ('driver.memory', '4g'),
        ('executor.memory', '5g'),
        ('executor.cores', '3'),
        ('executor.instances', '4'),
        ('dynamicAllocation.enabled', 'false'),
    )
])

# Configure and start a new Spark session (or reuse the one already running).
spark_session = (
    SparkSession.builder
    .appName(name=SPARK_APP_NAME)
    .master(master=spark_config_dict.get('master', 'local'))
    .config(conf=spark_conf)
    .getOrCreate()
)

# Apply the configured log level to the underlying Spark context.
spark_session.sparkContext.setLogLevel(spark_config_dict.get('logLevel', 'WARN'))

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/14 17:00:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/14 17:00:54 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Read input dataframes.
# The request metadata is read with the Parquet reader; 'header' is a CSV
# option and has no meaning there, so it is not passed.
ec_metadata_df: DataFrame = spark_session.read.parquet(input_config_dict.get('path.metadataRequest'))
# The file to analyze is a CSV whose first row is used as the column names.
input_to_analyze_df: DataFrame = spark_session.read.csv(input_config_dict.get('path.analyzeFile'), header=True)

In [4]:
# Deduplicate requests: for every requestId keep the highest version number
# among the rows that carry a product type.
# `max` and `desc` here are the pyspark.sql.functions versions brought in by
# the star import, not the Python builtins.
deduplicates_df: DataFrame = (
    ec_metadata_df
    # Cast first so the maximum is computed on the integer value.
    .withColumn('version', ec_metadata_df.version.cast('integer'))
    .filter(ec_metadata_df.paperMeta_productType.isNotNull())
    .groupby(ec_metadata_df.requestId)
    .agg(max('version').alias('version'))
    .sort(desc('version'))
    # Rename both columns so they cannot be confused with the columns of
    # ec_metadata_df when the two frames are joined back together.
    .withColumnRenamed('requestId', 'maxRequestId')
    .withColumnRenamed('version', 'maxVersion')
)

# deduplicates_df.show(n=5, truncate=True)

In [5]:
# Basic paper-mailing ("postalizzazione") view - TODO: enhance selection pipeline
# Keeps only the latest version of each request (as listed in deduplicates_df)
# and flattens its event list into space-separated summary columns.
all_paper_metadata_df: DataFrame = (
    ec_metadata_df
    # The join condition is given explicitly instead of an unconditioned join
    # followed by a filter, so the plan never depends on a cartesian product.
    .join(
        deduplicates_df,
        on=(ec_metadata_df.requestId == deduplicates_df.maxRequestId)
        & (ec_metadata_df.version == deduplicates_df.maxVersion),
        how='inner',
    )
    .selectExpr(
        # 17-character timestamps are rebuilt as their first 16 characters plus ":00Z".
        'if (length(requestTimestamp) = 17, concat(substr(requestTimestamp, 0, 16), ":00Z"), requestTimestamp) as requestTimestamp',
        'paperMeta_productType',
        # Every status code of the request, in event-list order.
        'array_join(transform(event_list, e -> e.paperProg_statusCode), " ") as statusesString',
        # Only the status codes matching the listed patterns.
        'array_join(transform(filter(event_list, e -> e.paperProg_statusCode rlike "CON080|CON016|(CON9.*)|(RECRN.*)|(RECAG.*)|(RECRS.*)|(P.*)|(RECRSI.*)|(RECRI.*)"),e -> e.paperProg_statusCode), " ") as businessStatusesString',
        'array_join(array_distinct(transform(event_list,e -> e.paperProg_deliveryFailureCause)), " ") as deliveryFailureCause',
        # Distinct document types attached to events whose code matches REC...B or REC...E.
        'array_join(array_distinct(flatten(transform(filter(event_list, e -> e.paperProg_statusCode rlike "(REC.*B)|(REC.*E)").paperProg_attachments,e -> e.documentType))), " ") as attachments',
        'array_join(array_distinct(transform(filter(event_list, e -> e.paperProg_statusCode rlike "REC.*" AND NOT e.paperProg_statusCode in ("RECAG012","REC090")),e -> e.paperProg_registeredLetterCode)), " ") as registeredLetterCode',
        'requestId',
        # The SQL parser unescapes string literals, so a backslash-escaped dot
        # would reach the regex engine as a bare "." (any character). The "[.]"
        # character class keeps the separator dot literal.
        'regexp_extract(requestId, ".*IUN_(.*)[.]RECINDEX.*", 1) as paperIun',
        'regexp_extract(requestId, "pn-cons-000~(.*)[.]PCRETRY_.", 1) as paperRequestId',
        'version as paperVersion'
    )
)

# all_paper_metadata_df.show(n=5, truncate=False)

In [12]:
# Hard-coded request ids to look up, one single-column record per id.
data = [
    {"requestIdNoRecag": request_id}
    for request_id in (
        "pn-cons-000~PREPARE_ANALOG_DOMICILE.IUN_KWRL-LAUQ-YDRU-202307-H-1.RECINDEX_0.ATTEMPT_0.PCRETRY_0",
        "pn-cons-000~PREPARE_ANALOG_DOMICILE.IUN_MRPJ-WPRV-PQHZ-202308-Z-1.RECINDEX_0.ATTEMPT_0.PCRETRY_0",
        "pn-cons-000~PREPARE_ANALOG_DOMICILE.IUN_YZTR-AQAT-LNQH-202308-H-1.RECINDEX_0.ATTEMPT_0.PCRETRY_1",
        "pn-cons-000~PREPARE_ANALOG_DOMICILE.IUN_VEWH-QZWV-RQNA-202308-X-1.RECINDEX_0.ATTEMPT_0.PCRETRY_1",
        "pn-cons-000~PREPARE_ANALOG_DOMICILE.IUN_AGKG-AQDV-ZUDN-202307-M-1.RECINDEX_0.ATTEMPT_0.PCRETRY_1",
        "pn-cons-000~PREPARE_ANALOG_DOMICILE.IUN_LNXV-VDTN-MVEH-202307-D-1.RECINDEX_0.ATTEMPT_0.PCRETRY_0",
    )
]

# Turn the records into a Spark DataFrame with the single column requestIdNoRecag.
missing_recag_df: DataFrame = spark_session.createDataFrame(data)

In [14]:
# Look up the hard-coded request ids in the paper metadata and keep only the
# status string and the id of each match. The small lookup frame is broadcast.
missing_recag_joined_df: DataFrame = (
    all_paper_metadata_df
    .join(
        broadcast(missing_recag_df),
        on=missing_recag_df.requestIdNoRecag == all_paper_metadata_df.requestId,
    )
    .select('statusesString', 'requestId')
)

# Print up to 50 matches without truncating the column values.
missing_recag_joined_df.show(n=50, truncate=False)



+-----------------------------------------------------+------------------------------------------------------------------------------------------------+
|statusesString                                       |requestId                                                                                       |
+-----------------------------------------------------+------------------------------------------------------------------------------------------------+
|P000 CON010 CON011 CON012 CON09A CON016 CON018 CON080|pn-cons-000~PREPARE_ANALOG_DOMICILE.IUN_AGKG-AQDV-ZUDN-202307-M-1.RECINDEX_0.ATTEMPT_0.PCRETRY_1|
|P000 CON996                                          |pn-cons-000~PREPARE_ANALOG_DOMICILE.IUN_MRPJ-WPRV-PQHZ-202308-Z-1.RECINDEX_0.ATTEMPT_0.PCRETRY_0|
|P000 CON996                                          |pn-cons-000~PREPARE_ANALOG_DOMICILE.IUN_YZTR-AQAT-LNQH-202308-H-1.RECINDEX_0.ATTEMPT_0.PCRETRY_1|
|P000 CON996                                          |pn-cons-000~PREPARE_ANALOG_

                                                                                

In [None]:
# For every request id of the input file, build the id that carries the
# previous PC retry number: the text up to and including ".PCRETRY_" followed
# by the retry number minus one.
# NOTE(review): an id ending in PCRETRY_0 yields an id ending in PCRETRY_-1 -
# confirm such rows are meant to find no match in later joins.
decremented_pc_retry_df: DataFrame = input_to_analyze_df \
    .select(
        input_to_analyze_df.requestId.alias('originalRequestId'),
        concat(
            # Group 0 is the whole match, i.e. the request id without its retry number.
            regexp_extract(input_to_analyze_df.requestId, r"pn-cons-000~(.*)\.PCRETRY_", 0),
            # The separator dot is escaped like in the sibling patterns; the
            # decremented number is cast back to string explicitly for concat.
            (regexp_extract(input_to_analyze_df.requestId, r"\.PCRETRY_(.*)", 1).cast('integer') - 1).cast('string')
        ).alias('requestIdWithDecrementedPcRetryNumber'),
        # Request id stripped of the "pn-cons-000~" prefix and the ".PCRETRY_<n>" suffix.
        regexp_extract(input_to_analyze_df.requestId, r"pn-cons-000~(.*)\.PCRETRY_.", 1).alias('paperRequestId')
    )

# decremented_pc_retry_df.show(n=50, truncate=False)

In [None]:
# Match each decremented request id against the paper metadata and report the
# original id, the decremented id and the statuses recorded for the latter.
# The frame derived from the input file is broadcast to the join.
recag013_bloccati2_df: DataFrame = (
    all_paper_metadata_df
    .join(
        broadcast(decremented_pc_retry_df),
        on=all_paper_metadata_df.requestId == decremented_pc_retry_df.requestIdWithDecrementedPcRetryNumber,
    )
    .select('originalRequestId', 'requestIdWithDecrementedPcRetryNumber', 'statusesString')
)

# recag013_bloccati2_df.show(n=50, truncate=False)