In [1]:
# Import app configuration
from config import APP_CONFIG

In [2]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.dataframe import DataFrame

# Memory size exported through the SPARK_MEM environment variable.
# The original read this value from the 'spark.master' key, so a configured
# master URL would have been exported as the memory size.
# NOTE(review): the 'spark.mem' key name is an assumption - align it with
# config.py; when the key is absent the '24g' default is used.
os.environ['SPARK_MEM'] = APP_CONFIG.get('spark.mem', '24g')

# Resource settings: every value comes from APP_CONFIG, with a fallback
# default used when the key is not configured.
spark_conf = SparkConf()
spark_conf.set('spark.driver.memory', APP_CONFIG.get('spark.driver.memory', '4g'))
spark_conf.set('spark.executor.memory', APP_CONFIG.get('spark.executor.memory', '5g'))
spark_conf.set('spark.executor.cores', APP_CONFIG.get('spark.executor.cores', '3'))
spark_conf.set('spark.executor.instances', APP_CONFIG.get('spark.executor.instances', '4'))
spark_conf.set('spark.dynamicAllocation.enabled', APP_CONFIG.get('spark.dynamicAllocation.enabled', 'false'))

# Configure and start a new Spark session (or reuse the one already running
# in this kernel, as getOrCreate() does).
spark_session = (SparkSession.builder
                .appName(name=APP_CONFIG.get('spark.name', 'spark-app'))
                .master(master=APP_CONFIG.get('spark.master', 'local'))
                .config(conf=spark_conf)
                .getOrCreate())

# Log verbosity of the Spark context; defaults to WARN when not configured.
spark_session.sparkContext.setLogLevel(APP_CONFIG.get('spark.logLevel', 'WARN'))

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/13 18:41:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read input dataframes; both locations are taken from APP_CONFIG.
# The metadata requests are stored as Parquet. 'header' is a CSV-only option
# and has no meaning for the Parquet reader, so it is no longer passed.
ec_metadata_df: DataFrame = spark_session.read.parquet(APP_CONFIG.get('input.path.metadata-request'))
# The file to analyze is a CSV whose first line holds the column names.
input_to_analyze_df: DataFrame = spark_session.read.csv(APP_CONFIG.get('input.path.analyzeFile'), header=True)

In [4]:
# Deduplicate requests: for each requestId that has a product type, compute
# the highest (integer) version. The resulting columns are renamed to
# maxRequestId / maxVersion so they can be joined back onto ec_metadata_df
# without name clashes.
# Note: `max` and `desc` are the pyspark.sql.functions versions brought in by
# the star import, not the Python builtins.
deduplicates_df: DataFrame = (
    ec_metadata_df
    .withColumn('version', ec_metadata_df['version'].cast('integer'))
    .where(ec_metadata_df['paperMeta_productType'].isNotNull())
    .groupBy(ec_metadata_df['requestId'])
    .agg(max('version').alias('version'))
    .orderBy(desc('version'))
    .withColumnRenamed('requestId', 'maxRequestId')
    .withColumnRenamed('version', 'maxVersion')
)

# deduplicates_df.show(n=5, truncate=True)

In [5]:
# Basic postal-delivery selection - TODO: enhance selection pipeline
#
# Keeps, for every request, only the row holding its highest version (as
# computed in deduplicates_df) and derives the analysis columns from it.
# The match condition is passed to join() directly instead of an
# unconditioned join followed by where(), so the join type is explicit.
latest_version_condition = (
    (ec_metadata_df.requestId == deduplicates_df.maxRequestId)
    & (ec_metadata_df.version == deduplicates_df.maxVersion)
)

all_paper_metadata_df: DataFrame = ec_metadata_df \
    .join(deduplicates_df, on=latest_version_condition, how='inner') \
    .selectExpr(
        # 17-character timestamps get their first 16 characters kept and ":00Z" appended.
        'if (length(requestTimestamp) = 17, concat(substr(requestTimestamp, 0, 16), ":00Z"), requestTimestamp) as requestTimestamp',
        'paperMeta_productType',
        # Space-separated status codes of every event in event_list.
        'array_join(transform(event_list, e -> e.paperProg_statusCode), " ") as statusesString',
        # Same, restricted to the status codes matched by the pattern below.
        'array_join(transform(filter(event_list, e -> e.paperProg_statusCode rlike "CON080|CON016|(CON9.*)|(RECRN.*)|(RECAG.*)|(RECRS.*)|(P.*)|(RECRSI.*)|(RECRI.*)"),e -> e.paperProg_statusCode), " ") as businessStatusesString',
        'array_join(array_distinct(transform(event_list,e -> e.paperProg_deliveryFailureCause)), " ") as deliveryFailureCause',
        # Distinct document types attached to events whose status matches REC...B / REC...E.
        'array_join(array_distinct(flatten(transform(filter(event_list, e -> e.paperProg_statusCode rlike "(REC.*B)|(REC.*E)").paperProg_attachments,e -> e.documentType))), " ") as attachments',
        # Distinct registered letter codes of REC* events, excluding RECAG012 and REC090.
        'array_join(array_distinct(transform(filter(event_list, e -> e.paperProg_statusCode rlike "REC.*" AND NOT e.paperProg_statusCode in ("RECAG012","REC090")),e -> e.paperProg_registeredLetterCode)), " ") as registeredLetterCode',
        'requestId',
        # "[.]" matches a literal dot. A backslash escape written as "\\." in
        # this Python string reaches the SQL parser as "\." and is unescaped
        # to a bare "." (any character) before the regex engine sees it.
        'regexp_extract(requestId, ".*IUN_(.*)[.]RECINDEX.*", 1) as paperIun',
        'regexp_extract(requestId, "pn-cons-000~(.*)[.]PCRETRY_.", 1) as paperRequestId',
        'version as paperVersion'
    )

# all_paper_metadata_df.show(n=5, truncate=False)

In [6]:
# Attach paper metadata to the rows of the file to analyze, matching the
# input `iun` with the `paperIun` extracted from the request id.
# This is a left join: every input row is kept, with null metadata columns
# when no request matches.
filtered_all_paper_metadata_df: DataFrame = input_to_analyze_df.join(
    other=all_paper_metadata_df,
    on=input_to_analyze_df['iun'] == all_paper_metadata_df['paperIun'],
    how='left',
)

# filtered_all_paper_metadata_df.show(n=5, truncate=True)

In [7]:
# For each paperIun keep a single request: the one with the greatest
# requestId (`max` is the Spark aggregate from the star import).
distinct_request_df: DataFrame = (
    filtered_all_paper_metadata_df
    .groupBy(filtered_all_paper_metadata_df['paperIun'])
    .agg(max(filtered_all_paper_metadata_df['requestId']).alias('maxRequestId'))
)

# distinct_request_df.show(n=5, truncate=True)

In [8]:
# Bring back the full metadata row of the request selected for each paperIun.
# Left join: a selected request id without a matching metadata row yields
# null metadata columns.
selected_requests_df: DataFrame = distinct_request_df.join(
    filtered_all_paper_metadata_df,
    on=distinct_request_df['maxRequestId'] == filtered_all_paper_metadata_df['requestId'],
    how='left',
)

# Business status string turned into a readable sequence: 'P000' and every
# 'CON...' code are removed, the result is trimmed, and each remaining run of
# spaces becomes '->'.
status_sequence = regexp_replace(
    trim(regexp_replace('businessStatusesString', 'P000|CON[a-zA-Z0-9]*', '')),
    ' +',
    '->',
).alias('sequence')

# Keep only the columns needed by the clustering step.
distinct_metadata_df: DataFrame = selected_requests_df.select(
    selected_requests_df['iun'],
    selected_requests_df['registeredLetterCode'],
    selected_requests_df['paperMeta_productType'].alias('paperMetaProductType'),
    selected_requests_df['attachments'],
    selected_requests_df['requestId'],
    selected_requests_df['paperRequestId'],
    status_sequence,
)

# distinct_metadata_df.show(n=50, truncate=False)

In [9]:
# Cluster the requests by (sequence, attachments). Each output row has the
# form <sequence, attachments, count, requestInformationSet>, most frequent
# cluster first.

# One JSON document per request, carrying its identifying fields.
request_information_json = to_json(
    struct(
        distinct_metadata_df['requestId'],
        distinct_metadata_df['paperRequestId'],
        distinct_metadata_df['registeredLetterCode'],
    )
)

group_by_sequences_df: DataFrame = (
    distinct_metadata_df
    .groupBy(distinct_metadata_df['sequence'], distinct_metadata_df['attachments'])
    .agg(
        count(distinct_metadata_df['iun']).alias('count'),
        # The distinct JSON documents of the cluster, cast to a single string column.
        collect_set(request_information_json).cast('string').alias('requestInformationSet'),
    )
    .orderBy(desc('count'))
)

# group_by_sequences_df.show(n=50, truncate=False)

In [20]:
from utils import data_frame_writer

# Output format and destination come from APP_CONFIG. The file name is the
# configured output path followed by the Spark application name.
output_format: str = APP_CONFIG.get('output.format')
# Use the same 'spark-app' fallback as the session's appName, so a missing
# 'spark.name' key no longer breaks the concatenation with a TypeError.
output_filename: str = APP_CONFIG.get('output.path') + APP_CONFIG.get('spark.name', 'spark-app')

# Persist the clustered sequences through the project writer helper.
data_frame_writer.write(
    df=group_by_sequences_df,
    file_name=output_filename,
    file_format=output_format,
)

# dataframe_writer: DataFrameWriter = group_by_sequences_df.write.mode('overwrite')
# 
# # Use parquet as default output format
# if output_format.lower() == 'csv':
#     dataframe_writer.options(header='True', delimiter=';').csv(output_filename + '.csv')
# else:
#     dataframe_writer.parquet(APP_CONFIG.get('output.path') + APP_CONFIG.get('spark.name') + '.parquet')

                                                                                