In [None]:
import cml.data_v1 as cmldata

import configparser
import uuid
import os
from typing import Dict
from pyspark.sql.functions import to_date, col
from pyspark import SparkConf
from pyspark.sql import SparkSession, DataFrameWriter
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, IntegerType, DateType, StringType
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from datetime import datetime, timedelta, date
from pyspark.sql.functions import col, ceil, when
import pandas as pd

# Sample in-code customization of spark configurations
# (kept commented out; uncomment to override the executor settings below)
#from pyspark import SparkContext
#SparkContext.setSystemProperty('spark.executor.cores', '1')
#SparkContext.setSystemProperty('spark.executor.memory', '2g')

# Name of the CML data connection from which the Spark session is obtained.
CONNECTION_NAME = "pdnd-prod-dl-1"
conn = cmldata.get_connection(CONNECTION_NAME)
# Spark session used by the following cells for SQL queries, the CSV read
# and the final table write.
spark = conn.get_spark_session()

# Sample usage to run a query through spark: lists the visible databases,
# which doubles as a quick check that the session works.
EXAMPLE_SQL_QUERY = "show databases"
spark.sql(EXAMPLE_SQL_QUERY).show()

In [None]:
# Sender whose rows are analysed. Hoisted out of the SQL text so it can be
# changed in one place.
# NOTE(review): presumably the INPS sender id, given the output table name
# used at the end of the notebook — confirm.
SENDER_PA_ID = "53b40136-65f2-424b-acfb-7fae17e35c60"

# Rows of send.gold_postalizzazione_analytics for that sender where
# accettazione_recapitista_con018_data is set; only the columns needed for
# the per-region aggregation are selected.
# The id is written as a single-quoted SQL string literal: double quotes are
# read as strings only in Spark's default mode and are parsed as identifiers
# when spark.sql.ansi.doubleQuotedIdentifiers is enabled.
df_gold = spark.sql(
    f"""
    SELECT requestid,
           iun,
           geokey,
           accettazione_recapitista_con018_data
    FROM send.gold_postalizzazione_analytics
    WHERE accettazione_recapitista_con018_data IS NOT NULL
      AND senderpaid = '{SENDER_PA_ID}'
    """
)

In [None]:
# Number of rows selected into df_gold; shown as the cell output.
# Calling count() makes Spark execute the query.
df_gold.count()

In [None]:
# Register df_gold under the name DF_GOLD so it can be referenced from
# spark.sql() queries; an existing view with that name is replaced.
df_gold.createOrReplaceTempView("DF_GOLD")

In [None]:
# Peek at the first row to sanity-check the selected columns. Left as the
# cell's last expression so the notebook displays it; no print() wrapper is
# needed.
# NOTE(review): the output contains real identifiers (requestid, iun) —
# clear it before sharing the notebook.
df_gold.head()

In [None]:
# Schema for the Spark DataFrame that maps each CAP to its Regione.
# Both columns are nullable strings.
# NOTE(review): CAP is presumably kept as a string to preserve leading
# zeros — confirm against the CSV.
schema = (
    StructType()
    .add("CAP", StringType(), True)
    .add("Regione", StringType(), True)
)

In [None]:
# Load the semicolon-separated CAP -> Regione lookup file. The first line is
# treated as a header and the explicit schema is applied instead of inferring
# column types.
df_cap_regione = (
    spark.read
    .schema(schema)
    .option("header", True)
    .option("sep", ";")
    .csv("CAP-Regione.csv")
)

In [None]:
# Drop rows that are exact duplicates across all columns (CAP and Regione).
# NOTE(review): a CAP listed with two different Regione values survives this
# step and would match twice in any join on CAP — confirm the file has a
# single region per CAP.
df_cap_regione = df_cap_regione.distinct()

In [None]:
# Number of distinct (CAP, Regione) rows left after de-duplication; shown as
# the cell output.
df_cap_regione.count()

In [None]:
# Register the lookup under the name DF_CAP_REGIONE so it can be joined from
# spark.sql() queries; an existing view with that name is replaced.
df_cap_regione.createOrReplaceTempView("DF_CAP_REGIONE")

In [None]:
# Show a few rows of the lookup. print() on a Spark DataFrame by default
# emits only its column/type summary (DataFrame[CAP: string, Regione: string]),
# not the data, so show() is used to actually inspect the content.
df_cap_regione.show(5, truncate=False)

# Calcolo delle numeriche per regioni e numeriche totali di affidi con con018 <> null

In [None]:
# Count of requests per Regione and per month/year of
# accettazione_recapitista_con018_data.
#  - DF_GOLD is joined to DF_CAP_REGIONE on geokey = CAP with a plain JOIN
#    (inner): rows whose geokey has no matching CAP are left out of the totals.
#  - COUNT(g.requestid) counts joined rows with a non-null requestid, so a CAP
#    present under more than one Regione contributes once per matching row.
AFFIDI_PER_REGIONE_SQL = """SELECT r.Regione,
                    MONTH(g.accettazione_recapitista_con018_data) AS mese_accettazione,
                    YEAR(g.accettazione_recapitista_con018_data) AS anno_accettazione,
                    COUNT(g.requestid) AS totale_affidi
                    FROM DF_GOLD g JOIN DF_CAP_REGIONE r ON (g.geokey = r.CAP)
                    GROUP BY r.Regione, MONTH(g.accettazione_recapitista_con018_data), YEAR(g.accettazione_recapitista_con018_data) """

result = spark.sql(AFFIDI_PER_REGIONE_SQL)

In [None]:
result.count()

# Prova 2 esportare il risultato in tabella

In [None]:
# Register the aggregated result under the name DF_OUTPUT so it can be
# selected from spark.sql(); an existing view with that name is replaced.
result.createOrReplaceTempView("DF_OUTPUT")

In [None]:
 # Write every row of DF_OUTPUT to the table send_dev.inps_territori using
 # the Iceberg provider (format-version 2, engine.hive.enabled=true).
 # createOrReplace() creates the table or replaces an existing one with the
 # same name, so re-running this cell overwrites the previous export.
 (
     spark.sql("SELECT * FROM DF_OUTPUT")
     .writeTo("send_dev.inps_territori")
     .using("iceberg")
     .tableProperty("format-version", "2")
     .tableProperty("engine.hive.enabled", "true")
     .createOrReplace()
 )