In [None]:
# Import app configuration
from typing import Dict
from config import APP_CONFIG

# Split the application configuration into its three sections. A missing
# section falls back to an empty mapping so later `.get()` lookups keep working.
spark_config_dict: Dict[str, str] = APP_CONFIG.get('spark', {})
input_config_dict: Dict[str, str] = APP_CONFIG.get('input', {})
output_config_dict: Dict[str, str] = APP_CONFIG.get('output', {})

# Spark application name; the write cells also reuse it as the output name.
SPARK_APP_NAME = spark_config_dict.get('name', 'spark-app')

In [None]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.dataframe import DataFrame

# Export the configured memory size through the SPARK_MEM environment variable
# before the session is created (falls back to 24g).
os.environ['SPARK_MEM'] = spark_config_dict.get('memory', '24g')

# Resource settings; every value can be overridden from the 'spark' config
# section, otherwise the default shown here is used.
spark_conf = (
    SparkConf()
    .set('spark.driver.memory', spark_config_dict.get('driver.memory', '4g'))
    .set('spark.executor.memory', spark_config_dict.get('executor.memory', '5g'))
    .set('spark.executor.cores', spark_config_dict.get('executor.cores', '3'))
    .set('spark.executor.instances', spark_config_dict.get('executor.instances', '4'))
    .set('spark.dynamicAllocation.enabled', spark_config_dict.get('dynamicAllocation.enabled', 'false'))
)

# Build the Spark session (getOrCreate reuses an already running session).
spark_session = (
    SparkSession.builder
    .appName(SPARK_APP_NAME)
    .master(spark_config_dict.get('master', 'local'))
    .config(conf=spark_conf)
    .getOrCreate()
)

# Limit driver-side log output to the configured level (WARN by default).
spark_session.sparkContext.setLogLevel(spark_config_dict.get('logLevel', 'WARN'))

In [None]:
# Request metadata read from the 'path.metadataRequest' input, restricted to
# the '890' and 'AR' product types. The IUN is extracted from `requestId` with
# the regular expression below (text between 'IUN_' and '.RECINDEX').
# `header` is a CSV reader option and has no meaning for Parquet, so it is no
# longer passed to `read.parquet`.
ec_metadata_df: DataFrame = (
    spark_session.read.parquet(input_config_dict.get('path.metadataRequest'))
    .filter(col('paperMeta_productType').isin('890', 'AR'))
    .select(
        regexp_extract('requestId', '.*IUN_(.*)\\.RECINDEX.*', 1).alias('iun'),
        'event_list',
        'paperMeta_productType',
    )
)

# Notifications, reduced to the join key (`iun`) and the sender columns.
notifications_df: DataFrame = (
    spark_session.read.parquet(input_config_dict.get('path.notifications'))
    .select('iun', 'senderDenomination', 'senderPaId')
)

# 051c98e9-40ce-43a6-aa6e-2123889e1532
# request_regione_marche_df: DataFrame = spark_session.read.csv(input_config_dict.get('path.analyzeFile'), header=True)
# NOTE(review): a later cell joins against `request_with_cap_comune_mortara_df`,
# which no visible cell defines; the commented-out read above may be its
# predecessor - confirm and restore the load if so.

# Cost matrix CSV, read without a header row: Spark names the columns
# _c0.._c10, which are renamed here. Only rows for product '890' are kept,
# de-duplicated on (geokey, product, recapitista).
matrice_costi_2023_pivot_df: DataFrame = (
    spark_session.read.csv(path='resources/matrice_costi_2023_pivot.csv')
    .withColumnsRenamed({
        '_c0': 'geokey',
        '_c1': 'product',
        '_c2': 'recapitista',
        '_c3': 'lotto',
        '_c4': 'costo_plico',
        '_c5': 'costo_foglio',
        '_c6': 'costo_demat',
        '_c7': 'min',
        '_c8': 'max',
        '_c9': 'costo',
        '_c10': 'costo_base_20gr',
    })
    .filter(col('product') == '890')
    .dropDuplicates(['geokey', 'product', 'recapitista'])
)

In [None]:
# Attach the sender columns from the notifications to every metadata row
# (left join: metadata rows without a matching notification are kept).
# DataFrame.filter() accepts only a Column or a SQL-expression string; the
# previous `.filter(['iun'])` passed a list and raised a type error as soon as
# the cell ran, so the condition is now an explicit predicate on `iun`.
# NOTE(review): the original intent cannot be recovered from the notebook - if
# one row per IUN was meant, use `.dropDuplicates(['iun'])` instead.
ec_metadata_df_with_sender_denomination_df: DataFrame = (
    ec_metadata_df
    .join(notifications_df, on='iun', how='left')
    .filter(col('iun').isNotNull())
)

# ec_metadata_df_with_sender_denomination_df.show(truncate=False)

In [None]:
# Number of rows per (sender, product type), exposed as Ente / Prodotto / Count
# and ordered from the largest count to the smallest.
ec_metadata_df_with_sender_denomination_grouped_by_sender_product_df: DataFrame = (
    ec_metadata_df_with_sender_denomination_df
    .groupBy('senderDenomination', 'paperMeta_productType')
    .agg(count('*').alias('Count'))
    .select(
        col('senderDenomination').alias('Ente'),
        col('paperMeta_productType').alias('Prodotto'),
        'Count',
    )
    .orderBy(col('Count').desc())
)

# ec_metadata_df_with_sender_denomination_grouped_by_sender_product_df.show(n=500, truncate=False)

In [None]:
from utils.custom_data_frame_writer import CustomDataFrameWriter

# Write out the per-sender / per-product counts.
# repartition(1) collapses the frame into a single partition before writing.
# NOTE(review): the last cell of the notebook writes a different frame with the
# same output_name / folder / format - check whether CustomDataFrameWriter
# overwrites this result.
sender_product_counts_single_partition_df = (
    ec_metadata_df_with_sender_denomination_grouped_by_sender_product_df.repartition(1)
)
CustomDataFrameWriter.write(
    df=sender_product_counts_single_partition_df,
    output_name=SPARK_APP_NAME,
    output_folder=output_config_dict.get('path'),
    output_format=output_config_dict.get('format'),
)

In [None]:
# Array column holding only the events whose status code matches the pattern
# "CON018" and whose status timestamp falls in month 11.
# `filter` here is pyspark.sql.functions.filter (brought in by the star
# import), not the Python builtin.
# NOTE(review): rlike is an unanchored regex match and only the month is
# checked (no year) - confirm both are intended.
filtered_event_con018_in_november_column = filter(
    ec_metadata_df_with_sender_denomination_df['event_list'],
    lambda event: (
        event['paperProg_statusCode'].rlike("CON018")
        & (month(event['paperProg_statusDateTime']) == 11)
    ),
)

# Keep the rows that have at least one such event, then turn EVERY event of
# those rows (not just the matching ones) into a struct carrying the event's
# registered letter code together with the row-level IUN, sender and product.
filtered_event_df: DataFrame = (
    ec_metadata_df_with_sender_denomination_df
    .filter(size(filtered_event_con018_in_november_column) > 0)
    .select(
        transform(
            ec_metadata_df_with_sender_denomination_df['event_list'],
            lambda event: struct(
                event['paperProg_registeredLetterCode'].alias('registeredLetterCode'),
                ec_metadata_df_with_sender_denomination_df['iun'],
                ec_metadata_df_with_sender_denomination_df['senderDenomination'],
                ec_metadata_df_with_sender_denomination_df['senderPaId'],
                ec_metadata_df_with_sender_denomination_df['paperMeta_productType'].alias('productType'),
            ),
        ).alias('eventStruct')
    )
)

In [None]:
# Explode the 'eventStruct' array of structs into one row per element with one
# column per struct field, then drop the original array column.
event_select_unwrapped_df: DataFrame = (
    filtered_event_df
    .select('*', inline(col('eventStruct')))
    .drop('eventStruct')
)

In [None]:
# f3d2d709-aee9-4246-82cd-5c17ca9770e5 = COMUNE DI MORTARA
# Keep this sender's events that carry a registered letter code, with one row
# per (registeredLetterCode, iun) pair.
event_select_unwrapped_filtered_by_sender_pa_df: DataFrame = (
    event_select_unwrapped_df
    .where(
        col('registeredLetterCode').isNotNull()
        & (col('senderPaId') == 'f3d2d709-aee9-4246-82cd-5c17ca9770e5')
    )
    .dropDuplicates(['registeredLetterCode', 'iun'])
)

In [None]:
# Enrich each letter with the request id and destination zip / foreign state
# taken from `request_with_cap_comune_mortara_df`, matched on IUN (left join,
# so letters without a matching request are kept with nulls).
# NOTE(review): `request_with_cap_comune_mortara_df` is not defined in any
# visible cell of this notebook; on a fresh kernel this cell raises NameError
# unless the frame is loaded first - confirm where it should come from.
event_select_unwrapped_filtered_by_sender_pa_with_cap_df: DataFrame = (
    event_select_unwrapped_filtered_by_sender_pa_df
    .join(request_with_cap_comune_mortara_df, on='iun', how='left')
    .select(
        event_select_unwrapped_filtered_by_sender_pa_df['iun'],
        request_with_cap_comune_mortara_df['preparerequestid'],
        event_select_unwrapped_filtered_by_sender_pa_df['registeredLetterCode'],
        event_select_unwrapped_filtered_by_sender_pa_df['senderDenomination'],
        event_select_unwrapped_filtered_by_sender_pa_df['senderPaId'],
        event_select_unwrapped_filtered_by_sender_pa_df['productType'],
        request_with_cap_comune_mortara_df['destzip'],
        request_with_cap_comune_mortara_df['destforeignstate'],
    )
)

# event_select_unwrapped_filtered_by_sender_pa_with_cap_df.show(truncate=False)

In [None]:
# Look up lotto and recapitista for each letter by matching the destination
# zip against the cost matrix geokey. This is an inner join, so letters whose
# zip has no geokey match are dropped.
# NOTE(review): the cost matrix is restricted to product '890' while the
# letters may also include 'AR' rows, and the join key is the zip only;
# a geokey with several recapitista rows also multiplies the matching letters -
# confirm both are intended.
request_comune_mortara_890_nov_with_lotto_recapitista_letter_df: DataFrame = (
    event_select_unwrapped_filtered_by_sender_pa_with_cap_df
    .join(
        matrice_costi_2023_pivot_df,
        on=(
            event_select_unwrapped_filtered_by_sender_pa_with_cap_df['destzip']
            == matrice_costi_2023_pivot_df['geokey']
        ),
        how='inner',
    )
    .select(
        'iun',
        'preparerequestid',
        'registeredLetterCode',
        'senderDenomination',
        'senderPaId',
        'productType',
        'lotto',
        'recapitista',
    )
)

# request_comune_mortara_890_nov_with_lotto_recapitista_letter_df.show(truncate=False)

In [None]:
from utils.custom_data_frame_writer import CustomDataFrameWriter

# Write out the letters enriched with lotto / recapitista.
# repartition(1) collapses the frame into a single partition before writing.
# NOTE(review): an earlier cell writes the per-sender counts with the same
# output_name / folder / format - check whether this call overwrites that
# result in CustomDataFrameWriter.
letters_with_lotto_single_partition_df = (
    request_comune_mortara_890_nov_with_lotto_recapitista_letter_df.repartition(1)
)
CustomDataFrameWriter.write(
    df=letters_with_lotto_single_partition_df,
    output_name=SPARK_APP_NAME,
    output_folder=output_config_dict.get('path'),
    output_format=output_config_dict.get('format'),
)