In [1]:
# Import app configuration
from typing import Dict
from config import APP_CONFIG

# Pull each configuration section once; a missing section falls back to an empty dict
spark_config_dict: Dict[str, str] = APP_CONFIG.get('spark', {})
input_config_dict: Dict[str, str] = APP_CONFIG.get('input', {})
output_config_dict: Dict[str, str] = APP_CONFIG.get('output', {})

# Application name, defaulting to 'spark-app' when the spark section does not define one
SPARK_APP_NAME = spark_config_dict.get('name', 'spark-app')

In [2]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.dataframe import DataFrame

# Export the configured memory value through the SPARK_MEM environment variable
os.environ['SPARK_MEM'] = spark_config_dict.get('memory', '24g')

# Resource settings: every entry falls back to a default when the config omits it
spark_conf = (
    SparkConf()
    .set('spark.driver.memory', spark_config_dict.get('driver.memory', '4g'))
    .set('spark.executor.memory', spark_config_dict.get('executor.memory', '5g'))
    .set('spark.executor.cores', spark_config_dict.get('executor.cores', '3'))
    .set('spark.executor.instances', spark_config_dict.get('executor.instances', '4'))
    .set('spark.dynamicAllocation.enabled', spark_config_dict.get('dynamicAllocation.enabled', 'false'))
)

# Configure and start new Spark Session (getOrCreate reuses an already active one)
spark_session = (
    SparkSession.builder
    .appName(SPARK_APP_NAME)
    .master(spark_config_dict.get('master', 'local'))
    .config(conf=spark_conf)
    .getOrCreate()
)

# Apply the configured log verbosity, WARN by default
log_level = spark_config_dict.get('logLevel', 'WARN')
spark_session.sparkContext.setLogLevel(log_level)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/22 18:27:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/02/22 18:27:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Read input dataframes
# Parquet files carry their own schema, so the CSV-only `header` option is not passed here
ec_metadata_df: DataFrame = spark_session.read.parquet(input_config_dict.get('path.metadataRequest'))
# CSV inputs: the first line of each file is treated as the column header
input_to_analyze_df: DataFrame = spark_session.read.csv(input_config_dict.get('path.analyzeFile'), header=True)
sequence_category_mapping_df: DataFrame = spark_session.read.csv('resources/sequence_category_mapping.csv', header=True)

In [None]:
# Deduplicate requests: for every requestId that has a product type, compute its highest version.
# `max` below is the Spark aggregate made available by the star import of pyspark.sql.functions,
# which shadows the Python builtin of the same name.
deduplicates_df: DataFrame = (
    ec_metadata_df
    # Cast first so the maximum is computed on the integer value of `version`
    .withColumn('version', ec_metadata_df.version.cast('integer'))
    .filter(ec_metadata_df.paperMeta_productType.isNotNull())
    .groupby(ec_metadata_df.requestId)
    .agg(max('version').alias('version'))
    .sort(desc('version'))
    # Rename both columns so they cannot clash with ec_metadata_df when joined back to it
    .withColumnRenamed('requestId', 'maxRequestId')
    .withColumnRenamed('version', 'maxVersion')
)

# deduplicates_df.show(n=5, truncate=True)

In [None]:
# Basic postalization (paper delivery) view - TODO: enhance selection pipeline

# Keep, for every requestId, only the row whose version equals the maximum computed above
deduplicates_join_condition = [
    ec_metadata_df.requestId == deduplicates_df.maxRequestId,
    ec_metadata_df.version == deduplicates_df.maxVersion
]

# Flatten the event list of each request into space-separated string columns.
# Literal dots in the regexp_extract patterns are written as `[.]`: a backslash escape would be
# consumed by the Spark SQL string-literal parser and leave a bare `.` wildcard in the regex.
all_paper_metadata_df: DataFrame = ec_metadata_df \
    .join(other=deduplicates_df, on=deduplicates_join_condition) \
    .selectExpr(
        # 17-character timestamps are cut to 16 characters and completed with ":00Z"
        'if (length(requestTimestamp) = 17, concat(substr(requestTimestamp, 0, 16), ":00Z"), requestTimestamp) as requestTimestamp',
        'paperMeta_productType',
        # Every status code, in event_list order
        'array_join(transform(event_list, e -> e.paperProg_statusCode), " ") as statusesString',
        # Only the status codes matching the listed code patterns
        'array_join(transform(filter(event_list, e -> e.paperProg_statusCode rlike "CON080|CON016|(CON9.*)|(RECRN.*)|(RECAG.*)|(RECRS.*)|(P.*)|(RECRSI.*)|(RECRI.*)"),e -> e.paperProg_statusCode), " ") as businessStatusesString',
        'array_join(array_distinct(transform(event_list,e -> e.paperProg_deliveryFailureCause)), " ") as deliveryFailureCause',
        # Distinct document types attached to events whose status matches REC.*B or REC.*E
        'array_join(array_distinct(flatten(transform(filter(event_list, e -> e.paperProg_statusCode rlike "(REC.*B)|(REC.*E)").paperProg_attachments,e -> e.documentType))), " ") as attachments',
        # Distinct registered letter codes of REC events, excluding RECAG012 and REC090
        'array_join(array_distinct(transform(filter(event_list, e -> e.paperProg_statusCode rlike "REC.*" AND NOT e.paperProg_statusCode in ("RECAG012","REC090")),e -> e.paperProg_registeredLetterCode)), " ") as registeredLetterCode',
        'requestId',
        # Text between "IUN_" and ".RECINDEX"
        'regexp_extract(requestId, ".*IUN_(.*)[.]RECINDEX.*", 1) as paperIun',
        # Text between "pn-cons-000~" and ".PCRETRY_<one character>"
        'regexp_extract(requestId, "pn-cons-000~(.*)[.]PCRETRY_.", 1) as paperRequestId',
        'version as paperVersion'
    )

# Replace empty strings produced by array_join() with nulls
all_paper_metadata_df = all_paper_metadata_df.na.replace('', None)

# all_paper_metadata_df.show(n=50, truncate=False)

In [None]:
# Attach the paper metadata to the rows to analyze, matching input `iun` with the extracted
# `paperIun`; the left join keeps input rows that have no matching metadata
filtered_all_paper_metadata_df: DataFrame = input_to_analyze_df.join(
    all_paper_metadata_df,
    on=(input_to_analyze_df.iun == all_paper_metadata_df.paperIun),
    how='left'
)

# filtered_all_paper_metadata_df.show(n=50, truncate=True)

In [None]:
# For each paperIun keep a single request: the one with the maximum requestId
distinct_request_df: DataFrame = (
    filtered_all_paper_metadata_df
    .groupby(filtered_all_paper_metadata_df.paperIun)
    .agg(max(filtered_all_paper_metadata_df.requestId).alias('maxRequestId'))
)

# distinct_request_df.show(n=50, truncate=True)

In [None]:
# Columns are resolved through col() with alias-qualified names in this cell, because joining
# distinct_request_df with filtered_all_paper_metadata_df is effectively a self join.

business_statuses = col('filtered_all_paper_metadata_df.businessStatusesString')

# `sequence` is the business status history joined with "->".
# When the history contains any REC status, P000 and CON* codes are stripped out first;
# otherwise the whole history is kept.
sequence: Column = regexp_replace(
    trim(
        when(
            business_statuses.contains('REC'),
            regexp_replace(business_statuses, 'P000|CON[a-zA-Z0-9]*', '')
        ).otherwise(business_statuses)
    ),
    ' +',
    '->'
)

# Distinct request: bring back the full metadata row of each selected maxRequestId,
# then project the columns of interest together with the derived sequence
distinct_metadata_df: DataFrame = (
    distinct_request_df.alias('distinct_request_df')
    .join(
        filtered_all_paper_metadata_df.alias('filtered_all_paper_metadata_df'),
        on=col('distinct_request_df.maxRequestId') == col('filtered_all_paper_metadata_df.requestId'),
        how='left'
    )
    .select(
        col('filtered_all_paper_metadata_df.iun'),
        col('filtered_all_paper_metadata_df.destzip'),
        col('filtered_all_paper_metadata_df.destforeignstate'),
        col('filtered_all_paper_metadata_df.registeredLetterCode'),
        col('filtered_all_paper_metadata_df.paperMeta_productType').alias('paperMetaProductType'),
        col('filtered_all_paper_metadata_df.attachments'),
        col('filtered_all_paper_metadata_df.requestId'),
        col('filtered_all_paper_metadata_df.paperRequestId'),
        sequence.alias('sequence')
    )
)

# distinct_metadata_df.show(n=50, truncate=False)

In [None]:
# Group sequences to create data clusters with the form (each row): <sequence, attachments, count, requestInformationSet>
# group_by_sequences_df: DataFrame = distinct_metadata_df \
#     .groupby(distinct_metadata_df.sequence, distinct_metadata_df.attachments) \
#     .agg(
#         count(distinct_metadata_df.iun).alias('requestCount'), 
#         collect_set(to_json(struct(
#             distinct_metadata_df.requestId, 
#             distinct_metadata_df.paperRequestId, 
#             distinct_metadata_df.registeredLetterCode,
#             distinct_metadata_df.destzip,
#             distinct_metadata_df.destforeignstate
#         ))).cast('string').alias('requestInformationSet'))

# Keep only the requests whose status sequence is exactly the one below
group_by_sequences_df = distinct_metadata_df.where(
    distinct_metadata_df.sequence == 'P000->CON080->CON016->PN999'
)

group_by_sequences_df.show(n=100, truncate=False)

In [None]:
# A mapping row applies when both sequence and attachments match;
# eqNullSafe is used so that a null on both sides also counts as a match
labeled_join_condition = [
    group_by_sequences_df.sequence.eqNullSafe(sequence_category_mapping_df.mappedSequence),
    group_by_sequences_df.attachments.eqNullSafe(sequence_category_mapping_df.mappedAttachments)
]

# Label every request with the category/action of its mapping row; the left join keeps
# requests that have no mapping
group_by_sequence_labeled_df: DataFrame = (
    group_by_sequences_df
    .join(sequence_category_mapping_df, on=labeled_join_condition, how='left')
    .select(
        group_by_sequences_df.requestId,
        group_by_sequences_df.sequence,
        group_by_sequences_df.attachments,
        # group_by_sequences_df.requestCount,
        # group_by_sequences_df.requestInformationSet,
        sequence_category_mapping_df.category,
        sequence_category_mapping_df.action,
    )
    # .sort(desc(group_by_sequences_df.requestCount))
)

group_by_sequence_labeled_df.show(n=500, truncate=False)

In [None]:
# Requests labeled with action SEND in the REDRIVE_RECAG012 category.
# requestId is selected only once: listing it twice yields two columns with the same name.
# NOTE(review): the duplicated entry may have been meant to be another column
# (e.g. attachments or category) - confirm.
recag012_send_only_df: DataFrame = group_by_sequence_labeled_df \
    .filter((group_by_sequence_labeled_df.action == 'SEND') & (group_by_sequence_labeled_df.category == 'REDRIVE_RECAG012')) \
    .select(
        group_by_sequence_labeled_df.requestId,
        group_by_sequence_labeled_df.sequence,
        group_by_sequence_labeled_df.action
    )

recag012_send_only_df.show(truncate=False)

In [None]:
# from utils.custom_data_frame_writer import CustomDataFrameWriter
# 
# # Write out dataframe
# CustomDataFrameWriter.write(
#     df=recag012_send_only_df.repartition(1),
#     output_name=SPARK_APP_NAME,
#     output_folder=output_config_dict.get('path'),
#     output_format=output_config_dict.get('format')
# )

In [3]:
# Both CSV locations can be overridden from the `input` configuration section; the literal
# paths are only fallbacks that preserve the previous hard-coded behaviour.
# NOTE(review): the keys `path.requestMissingCap` and `path.requestWithCap` are new - confirm
# their naming against the project configuration.
request_missing_cap_df: DataFrame = spark_session.read.csv(
    input_config_dict.get(
        'path.requestMissingCap',
        '/Users/lap-mbp16-n01-0346/Documents/Sviluppo/Data Analysis/requests/pn-9966-analisi_spedizioni_recapitista.csv'
    ),
    header=True,
    sep=';'
)

request_with_cap_df: DataFrame = spark_session.read.csv(
    input_config_dict.get('path.requestWithCap', 'resources/CAP_non_risaliti_cleaned.csv'),
    header=True
)

In [4]:
# Derive `sendRequest` from idRichiesta: replace the "pn-cons-000~PREPARE" prefix text with
# "SEND", then drop the ".PCRETRY_<one character>" part.
# The dot before PCRETRY is written as `[.]` so it matches a literal dot instead of any character.
request_missing_cap_with_send_request_df = request_missing_cap_df.select(
    '*',
    regexp_replace(
        regexp_replace(request_missing_cap_df.idRichiesta, 'pn-cons-000~PREPARE', 'SEND'),
        '[.]PCRETRY_.',
        ''
    ).alias('sendRequest')
)

In [5]:
# Enrich every request with the cap / foreign state of the row whose event_id equals its
# sendRequest; the inner join keeps only requests that have such a row
request_with_filledcap_df: DataFrame = (
    request_missing_cap_with_send_request_df
    .join(
        request_with_cap_df,
        on=(request_missing_cap_with_send_request_df.sendRequest == request_with_cap_df.event_id),
        how='inner'
    )
    .select(
        request_missing_cap_with_send_request_df.idRichiesta,
        request_missing_cap_with_send_request_df.sequenzaEventi,
        request_missing_cap_with_send_request_df.codiceLettera,
        request_missing_cap_with_send_request_df.prodottoPostale,
        request_missing_cap_with_send_request_df.categoria,
        request_missing_cap_with_send_request_df.descrizione,
        request_with_cap_df.cap,
        request_with_cap_df.stato_estero
    )
)

In [6]:
from utils.custom_data_frame_writer import CustomDataFrameWriter

# Write out dataframe: repartition to a single partition, then hand it to the project writer
# together with the configured output folder and format
single_partition_df = request_with_filledcap_df.repartition(1)

CustomDataFrameWriter.write(
    df=single_partition_df,
    output_name=SPARK_APP_NAME,
    output_folder=output_config_dict.get('path'),
    output_format=output_config_dict.get('format')
)