In [None]:
# ====== Setup di base (CML) ======
import cml.data_v1 as cmldata
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import os, re

# Workspace data connection to use (ask the admin for the exact name if it differs).
CONNECTION_NAME = "pdnd-prod-dl-1"

# Open the connection and obtain the Spark session used by the rest of the notebook.
conn = cmldata.get_connection(CONNECTION_NAME)
spark: SparkSession = conn.get_spark_session()
print("Spark enabled:", spark.version)

# Session-level "hygiene" settings meant to reduce the footprint of this notebook.
for _conf_key, _conf_value in (
    ("spark.sql.shuffle.partitions", "64"),                   # keep shuffles small
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),   # avoid Arrow errors
):
    spark.conf.set(_conf_key, _conf_value)

# ====== Easy-to-tweak parameters ======
# Lower bound used by the payment-date filter.
DATE_FROM = "2025-01-01 00:00:00"
# SQL LIKE pattern selecting the transfer macro category.
TRANSFER_CAT_LIKE = "%0101100IM%"
# Fraction of rows kept by the Spark-side random sample (5%); None disables sampling.
SAMPLE_FRACTION = 0.05
# Hard row cap; 0 or None means no cap (discouraged in CML).
MAX_ROWS = 100_000
# Number of example rows to display.
SHOW_ROWS = 10

# Directory for the CSV outputs inside the CML project (created if missing).
OUT_DIR = "./artifacts"
os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
# Source query: LATERAL VIEW EXPLODE yields one row per element of sp.transferlist,
# filtered by transfer category, minimum payment date and non-empty remittance text.
# TRANSFER_CAT_LIKE and DATE_FROM are interpolated directly into the SQL text, so they
# must stay trusted notebook constants (never feed user-supplied input here).
query = f"""
SELECT
  t.fiscalcodepa        AS pa,
  t.companyname         AS ragione_sociale,
  t.remittanceinformation AS remittance
FROM pagopa.silver_positive sp
LATERAL VIEW EXPLODE(sp.transferlist) t_view AS t
WHERE t.transfercategory LIKE '{TRANSFER_CAT_LIKE}'
  AND sp.paymentinfo.paymentdatetime >= CAST('{DATE_FROM}' AS TIMESTAMP)
  AND t.remittanceinformation IS NOT NULL
  AND t.remittanceinformation <> ''
"""

print("Eseguo query Spark…")
# Cast every selected column to string explicitly.
df = spark.sql(query).select(
    F.col("pa").cast("string"),
    F.col("ragione_sociale").cast("string"),
    F.col("remittance").cast("string")
)

# Sampling and hard cap (to stay light in CML).
# The sample is seeded so that re-running the notebook draws the same rows
# (given the same input data and partitioning) instead of a fresh random subset.
SAMPLE_SEED = 42
if SAMPLE_FRACTION:
    df = df.sample(
        withReplacement=False,
        fraction=float(SAMPLE_FRACTION),
        seed=SAMPLE_SEED,
    )
if MAX_ROWS and int(MAX_ROWS) > 0:
    # NOTE(review): limit() without an ordering does not guarantee which rows are kept.
    df = df.limit(int(MAX_ROWS))

# Cache, then count to materialise the frame before showing a few example rows.
df = df.cache()
print("Conteggio (dopo sample/limit):", df.count())
df.show(SHOW_ROWS, truncate=False)


In [None]:
# --- a) Volumes per PA / company name (same as the original first query)
# NOTE(review): Spark resolves this path against its own default filesystem, which
# may not be the local project directory created with os.makedirs — confirm where
# the output actually lands.
volumi_dir = os.path.join(OUT_DIR, "volumi_per_pa_csv")

volumi = (
    df.groupBy("pa", "ragione_sociale")
      .agg(F.count("*").alias("trx"))
      .orderBy(F.col("trx").desc())
)

volumi.show(20, truncate=False)
# Spark writes a directory of part files rather than a single CSV file.
(
    volumi.write
          .mode("overwrite")
          .option("header", "true")
          .csv(volumi_dir)
)
print("Volumi per PA: salvati in", volumi_dir)

# --- b) Rule-based categories (easy to read, fast)
#     Patterns run against the lower-cased text. rlike() is a partial (find-style)
#     match, so the patterns need no leading/trailing ".*".
#     Rules are evaluated top to bottom; the first one that matches wins.
rem = F.lower(F.col("remittance"))
categoria = (
    # "rata" must not be preceded by a letter; otherwise ordinary words that merely
    # contain it ("registrata", "separata", "dichiarata") would be tagged as "Rata".
    # Forms such as "rata1", "2rata" or "rata_unica" still match.
    F.when(rem.rlike(r"rateizz|rateal|(?<!\p{L})rata"), "Rata")
     .when(rem.rlike(r"provvediment"),                  "Provvedimento")
     .when(rem.rlike(r"accertament"),                   "Accertamento")
     .when(rem.rlike(r"avvis"),                         "Avviso")
     .when(rem.rlike(r"fattur"),                        "Fattura")
     .otherwise("Altro")
)

# Tag every row, then count rows per category (largest first).
cat_df = df.withColumn("categoria", categoria)
conteggi_cat = (cat_df.groupBy("categoria").count().orderBy(F.desc("count")))
conteggi_cat.show(truncate=False)

# Spark writes a directory of part files rather than a single CSV file.
conteggi_cat.write.mode("overwrite").option("header","true") \
    .csv(os.path.join(OUT_DIR, "conteggi_categorie_csv"))
print("Conteggi categorie: salvati in", os.path.join(OUT_DIR, "conteggi_categorie_csv"))


In [None]:
# Upper bound on the number of rows pulled to the driver as a pandas frame.
PANDAS_CAP = 30_000

# Keep only the text column, drop nulls, and cap the row count.
# limit() is a cap on whichever rows come first, not a random sample.
text_spark = (
    df.select("remittance")
      .dropna()
      .limit(PANDAS_CAP)
)

print("Converto a Pandas…")
import pandas as pd
# Arrow is disabled in the setup cell to avoid conversion errors.
text_pd = text_spark.toPandas()
print("Righe Pandas:", len(text_pd))
text_pd.head()


In [None]:
import nltk, string
from nltk.corpus import stopwords
from nltk.stem.snowball import ItalianStemmer

# Make sure the required NLTK resources are present (the first run downloads them).
for _resource in ("stopwords", "punkt"):
    nltk.download(_resource, quiet=True)

# Italian stop-word set and Snowball stemmer used by preprocess().
STOP_IT = set(stopwords.words("italian"))
STEMMER = ItalianStemmer()

# Matches any run of ASCII punctuation characters and/or digits.
punct_digits = re.compile("[" + re.escape(string.punctuation) + "0-9]+")

def preprocess(s: str, do_stem: bool = True) -> str:
    """Normalise one remittance string into a space-separated token string.

    Steps: lower-case the text, replace each run of ASCII punctuation/digits
    with a space, split on whitespace, drop tokens of length <= 2 and tokens
    found in ``STOP_IT``, then optionally stem what is left.

    Args:
        s: Raw text. Any value that is not a ``str`` (e.g. ``None``, NaN)
            produces an empty string.
        do_stem: When true, each kept token is passed through ``STEMMER.stem``.

    Returns:
        The kept tokens joined by single spaces; may be the empty string.
    """
    if not isinstance(s, str):
        return ""

    cleaned = punct_digits.sub(" ", s.lower())
    kept = []
    for word in cleaned.split():
        if len(word) <= 2 or word in STOP_IT:
            continue
        kept.append(STEMMER.stem(word) if do_stem else word)
    return " ".join(kept)

# Add the normalised text as a new column. na_action=None (the default) is spelled
# out on purpose: missing values are still handed to preprocess(), which maps any
# non-string input to "".
text_pd["processed"] = text_pd["remittance"].map(preprocess, na_action=None)
text_pd["processed"].head(n=10)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# TF-IDF over the preprocessed text, with conservative document-frequency bounds.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8)
X = vectorizer.fit_transform(text_pd["processed"])

# Number of clusters: pick by hand, or run an "elbow" analysis on a smaller sample.
k = 8
kmeans = KMeans(n_clusters=k, n_init="auto", random_state=42)
labels = kmeans.fit_predict(X)
text_pd["cluster"] = labels

# Rows per cluster, ordered by cluster id.
cluster_sizes = text_pd["cluster"].value_counts().sort_index()
print("Distribuzione cluster:", cluster_sizes, sep="\n")


In [None]:
import numpy as np

# Vocabulary terms, plus for each cluster the feature indices sorted by
# descending centroid weight.
terms = np.array(vectorizer.get_feature_names_out())
order_centroids = np.argsort(kmeans.cluster_centers_, axis=-1)[:, ::-1]

TOP_N = 10
separator = "=" * 70
for i in range(k):
    in_cluster = text_pd["cluster"] == i
    top_terms = terms[order_centroids[i, :TOP_N]]

    print("\n" + separator)
    print(f"Cluster {i} | size={in_cluster.sum()}")
    print("Top terms:", ", ".join(top_terms))
    # Show the first three raw remittance strings of the cluster as examples.
    samples = text_pd.loc[in_cluster, "remittance"].head(3).tolist()
    for s in samples:
        print("  -", s)


In [None]:
# Write the clustered pandas sample (raw text, processed text, cluster id) to one CSV.
csv_path = os.path.join(OUT_DIR, "remittance_clusters_sample.csv")
text_pd.to_csv(
    csv_path,
    columns=["remittance", "processed", "cluster"],
    index=False,
)
print("Clustering (sample) salvato in:", csv_path)
