In [None]:
# Import app configuration
from typing import Dict
from config import APP_CONFIG

# Each top-level section of the application config is optional: an absent
# section resolves to an empty dict, so the `.get(key, default)` lookups made
# later in the notebook still work.
spark_config_dict: Dict[str, str] = APP_CONFIG.get('spark', {})
input_config_dict: Dict[str, str] = APP_CONFIG.get('input', {})
output_config_dict: Dict[str, str] = APP_CONFIG.get('output', {})

# Name given to the Spark application ('spark-app' unless configured).
SPARK_APP_NAME = spark_config_dict.get('name', 'spark-app')

In [None]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.dataframe import DataFrame

# Export the configured memory size as SPARK_MEM before the session is built.
os.environ['SPARK_MEM'] = spark_config_dict.get('memory', '24g')

# Resource settings: each value can be overridden from the `spark` config
# section; otherwise the literal default written next to it applies.
spark_conf = SparkConf().setAll([
    ('spark.driver.memory', spark_config_dict.get('driver.memory', '4g')),
    ('spark.executor.memory', spark_config_dict.get('executor.memory', '5g')),
    ('spark.executor.cores', spark_config_dict.get('executor.cores', '3')),
    ('spark.executor.instances', spark_config_dict.get('executor.instances', '4')),
    ('spark.dynamicAllocation.enabled', spark_config_dict.get('dynamicAllocation.enabled', 'false')),
])

# Configure and start new Spark Session
session_builder = SparkSession.builder.appName(name=SPARK_APP_NAME)
session_builder = session_builder.master(master=spark_config_dict.get('master', 'local'))
spark_session = session_builder.config(conf=spark_conf).getOrCreate()

# Log verbosity of the Spark context ('WARN' unless configured).
spark_session.sparkContext.setLogLevel(spark_config_dict.get('logLevel', 'WARN'))

In [None]:
# Sender id the notifications are restricted to.
SENDER_PA_ID = '051c98e9-40ce-43a6-aa6e-2123889e1532'

# Fail with a readable message instead of handing `None` to the reader when
# the path is not configured.
notifications_path = input_config_dict.get('path.notifications')
if not notifications_path:
    raise KeyError("input config is missing 'path.notifications' (location of the notifications parquet data)")

# Notifications of the selected sender, reduced to the columns used below.
# No `header` option is passed: it is a CSV option and has no effect on the
# parquet reader.
notifications_df: DataFrame = (
    spark_session.read.parquet(notifications_path)
    .filter(col('senderPaId') == SENDER_PA_ID)
    .select('iun', 'senderDenomination', 'senderPaId', 'sentAt')
)

# Location of the requests JSON export. Set `path.requests` in the `input`
# config section to override it; the fallback is the path this notebook used
# originally, so existing setups behave as before.
requests_path = input_config_dict.get(
    'path.requests',
    '/Users/lap-mbp16-n01-0346/Documents/Sviluppo/Data Analysis/requests/con996-20240307.json',
)

# The request id is taken from the `S` field nested under `requestId` and
# exposed as a flat `requestId` column; rows without a value are dropped.
request_df: DataFrame = (
    spark_session.read.json(requests_path)
    .select(col('requestId').S.alias('requestId'))
    .dropna()
)

In [None]:
# Spot-check: display the rows of the export that carry this exact request id.
target_request_id = 'PREPARE_ANALOG_DOMICILE.IUN_HQUK-LTVJ-QREA-202403-G-1.RECINDEX_0.ATTEMPT_0'
request_df.where(col('requestId') == target_request_id).show(truncate=False)

In [None]:
# notifications_2024_marche.csv
# Set `path.notifications_marche` in the `input` config section to read the
# file from another location; the fallback is the path this notebook used
# originally, so existing setups behave as before.
notifications_marche_path = input_config_dict.get(
    'path.notifications_marche',
    '/Users/lap-mbp16-n01-0346/Documents/Sviluppo/Data Analysis/requests/notifications_2024_marche.csv',
)

# The CSV headers are all lower-case; they are renamed to the camelCase names
# used by `notifications_df` so both frames expose the same columns.
notifications_2024_marche_df: DataFrame = (
    spark_session.read.csv(notifications_marche_path, header=True)
    .select(
        'iun',
        col('senderdenomination').alias('senderDenomination'),
        col('senderpaid').alias('senderPaId'),
        col('sentat').alias('sentAt'),
    )
)

In [None]:
# Merge both notification sources and keep a single row per `iun`.
# `unionByName` matches columns by name rather than by position, so the merge
# stays correct even if one of the upstream `select` calls is reordered.
notifications_complete_df: DataFrame = (
    notifications_df
    .unionByName(notifications_2024_marche_df)
    .dropDuplicates(['iun'])
)

In [None]:
# Capture group 1 is the text between "IUN_" and ".RECINDEX" inside the
# request id; it is exposed as the `iun` column next to the original id.
iun_pattern = ".*IUN_(.*)\\.RECINDEX.*"
request_with_iun_df: DataFrame = request_df.select(
    col('requestId'),
    regexp_extract(col('requestId'), iun_pattern, 1).alias('iun'),
)

In [None]:
# Requests joined (inner, on `iun`) with their notification, plus the month
# and year extracted from `sentAt`.
# The result is kept under a name because the write cell at the end of the
# notebook reads `request_with_sender_df` and partitions by `month`.
# NOTE(review): assumed to be the frame that cell expects — confirm.
request_with_sender_df: DataFrame = (
    request_with_iun_df
    .join(notifications_complete_df, on='iun')
    .select(
        '*',
        month(notifications_complete_df.sentAt).alias('month'),
        year(notifications_complete_df.sentAt).alias('year'),
    )
)

# Number of joined requests whose notification was sent in March.
# NOTE(review): only the month is checked, so March of any year is counted —
# confirm whether a year filter is wanted as well.
request_with_sender_df.filter(col('month') == 3).count()

In [None]:
# Location of the Impala extract. Set `path.marche_impala` in the `input`
# config section to override it; the fallback is the path this notebook used
# originally, so existing setups behave as before.
marche_impala_path = input_config_dict.get(
    'path.marche_impala',
    '/Users/lap-mbp16-n01-0346/Documents/Sviluppo/Data Analysis/requests/con996_marche_impala.csv',
)

# Only the `iun` column of the extract is needed.
regine_marche_from_impala_df: DataFrame = (
    spark_session.read.csv(marche_impala_path, header=True)
    .select('iun')
)

In [None]:
# Preview the first 20 rows without truncating long values.
regine_marche_from_impala_df.show(n=20, truncate=False)

In [None]:
# Keep only the Impala rows whose `iun` has no matching request.
# A left-anti join expresses this directly and returns just the left-hand
# columns, so re-running this cell yields the same frame instead of stacking
# another all-null `requestId` column on every run.
regine_marche_from_impala_df = regine_marche_from_impala_df.join(
    request_with_iun_df,
    on='iun',
    how='left_anti',
)

In [None]:
# Display the rows left after the previous cell's reassignment (first 20,
# long values not truncated).
regine_marche_from_impala_df.show(n=20, truncate=False)

In [None]:
from utils.custom_data_frame_writer import CustomDataFrameWriter

# Write out dataframe
# The frame is collapsed to a single partition before being handed to the
# writer; destination folder and format come from the `output` config section.
# Expects `request_with_sender_df` to exist in the kernel namespace
# (presumably with a `month` column, since that name is passed as `partition_by`).
writer_options = dict(
    df=request_with_sender_df.repartition(1),
    output_name=SPARK_APP_NAME,
    output_folder=output_config_dict.get('path'),
    output_format=output_config_dict.get('format'),
    partition_by='month',
)
CustomDataFrameWriter.write(**writer_options)