In [0]:
%sql
-- Preview the raw gold-layer deals table that the rest of the notebook builds on.
SELECT * FROM sc_gold.deals

In [0]:
%sql

-- Rebuild the working copy (sc_gold.deals_2) from scratch with only the columns
-- used by the cleaning steps and the basket analysis below.
DROP TABLE IF EXISTS sc_gold.deals_2;

CREATE TABLE sc_gold.deals_2 AS
SELECT
  conta_name,
  data_venda,
  modelos,
  data_decisao_negocio,
  tipo_cliente_negocio
FROM sc_gold.deals;

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window as W
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType, DecimalType
from pyspark.sql.functions import split

In [0]:
table_name = "sc_gold.deals_2"

# Working copy of the deals table.
df = spark.table(table_name)

# Denominator for the percentages below.
total_rows = df.count()

# One aggregate per column: count of NULL values over total rows, as a percentage.
null_pct_exprs = [
    ((F.count(F.when(F.col(name).isNull(), name)) / total_rows) * 100).alias(name)
    for name in df.columns
]
null_percentages = df.select(*null_pct_exprs)

display(null_percentages)

In [0]:
# Row-cleaning step: drop rows whose customer name (conta_name) is missing.
table_name = "sc_gold.deals_2"
df0 = spark.table(table_name)

# A name counts as present when it is non-NULL, not blank after trimming,
# and not the literal text "null" (case-insensitive).
conta_trimmed = F.trim(F.col("conta_name"))
has_conta_name = (
    F.col("conta_name").isNotNull()
    & (conta_trimmed != "")
    & (F.lower(conta_trimmed) != "null")
)
df = df0.filter(has_conta_name)

# Write the filtered rows back over the same table.
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable(table_name)
)

In [0]:
# Step: remove the "data_decisao_negocio" column from the working table.
#
# "data_venda" is intentionally kept: the transaction-level analysis further down
# reads it from this same table (spark.table(TABLE_SRC).select(..., "data_venda", ...))
# to build (conta_name, data_venda) transaction ids, and would fail on a fresh
# top-to-bottom run if the column were dropped here.

# Read the table
df = spark.table("sc_gold.deals_2")

# Drop only 'data_decisao_negocio'
df_clean = df.drop("data_decisao_negocio")

# Overwrite the table with the cleaned dataset
(df_clean.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("sc_gold.deals_2"))

In [0]:
# Row-cleaning step: drop rows with an empty "modelos" value.
# TODO: check later whether this information can be recovered through a join instead.
table_name = "sc_gold.deals_2"
df0 = spark.table(table_name)

# A model counts as present when it is non-NULL, not blank after trimming,
# and not the literal text "null" (case-insensitive).
modelos_trimmed = F.trim(F.col("modelos"))
has_modelos = (
    F.col("modelos").isNotNull()
    & (modelos_trimmed != "")
    & (F.lower(modelos_trimmed) != "null")
)
df = df0.filter(has_modelos)

# Write the filtered rows back over the same table.
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable(table_name)
)

In [0]:
table_name = "sc_gold.deals_2"

# Keep only deals whose business customer type is 'Frota'.
is_frota = F.col("tipo_cliente_negocio") == "Frota"
df = spark.table(table_name).filter(is_frota)

# Write the result back over the same table.
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable(table_name)
)

In [0]:
table_name = "sc_gold.deals_2"

# Re-read the table after the cleaning steps.
df = spark.table(table_name)

# Denominator for the percentages below.
total_rows = df.count()

# Percentage of NULL values per column (same check as before the cleaning).
pct_null_by_column = []
for column_name in df.columns:
    null_count = F.count(F.when(F.col(column_name).isNull(), column_name))
    pct_null_by_column.append((null_count / total_rows * 100).alias(column_name))

null_percentages = df.select(pct_null_by_column)

display(null_percentages)

Market Basket Analysis

In [0]:
from pyspark.sql import functions as F

TABLE_SRC = "sc_gold.deals_2"  # source table
MIN_SUPPORT = 0.005             # 0.5% minimum support (a later cell resets this to 0.01)
MIN_CONFIDENCE = 0.20          # 20% minimum confidence (a later cell resets this to 0.30)

In [0]:
from pyspark.sql import functions as F 
from pyspark.sql import Window

# 1) Load the two columns needed for the basket analysis.
df = spark.table("sc_gold.deals_2").select("conta_name", "modelos")

# 2) Each account (conta_name) is treated as one transaction.
df = df.withColumn("transaction_id", F.col("conta_name"))

# 3) Binary matrix: one row per transaction, one 0/1 column per model.
df_flags = df.groupBy("transaction_id").pivot("modelos").agg(F.lit(1)).fillna(0)

# 4) basket_size = number of distinct models flagged in the transaction.
item_cols = [c for c in df_flags.columns if c != "transaction_id"]
basket_size_expr = sum(F.col(c) for c in item_cols)
df_basket = df_flags.withColumn("basket_size", basket_size_expr)

# 5) Number of transactions per basket size, smallest baskets first.
basket_counts = df_basket.groupBy("basket_size").agg(
    F.count("*").alias("num_transactions")
)
basket_dist = basket_counts.orderBy(F.asc("basket_size"))

# 6) Share of all transactions that falls in each basket size
#    (window spanning every row of the distribution).
window_all = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
total_transactions = F.sum("num_transactions").over(window_all)
basket_dist = basket_dist.withColumn(
    "pct_transactions",
    F.col("num_transactions") / total_transactions,
)

# 7) Global average basket size.
avg_expr = F.sum("basket_size") / F.count("*")
basket_avg = df_basket.agg(avg_expr.alias("avg_basket_size"))

basket_dist.show()
basket_avg.show()




In [0]:
from pyspark.sql import functions as F

# 1-2) Load the table and use conta_name as the transaction id.
df = (
    spark.table("sc_gold.deals_2")
         .select("conta_name", "modelos")
         .withColumn("transaction_id", F.col("conta_name"))
)

# 3) Binary pivot: one 0/1 column per model.
df_flags = df.groupBy("transaction_id").pivot("modelos").agg(F.lit(1)).fillna(0)

# 4) Keep transactions whose flags add up to at least two models.
cols_modelos = [c for c in df_flags.columns if c != "transaction_id"]
models_per_transaction = sum(df_flags[c] for c in cols_modelos)
df_multi = df_flags.filter(models_per_transaction >= 2)

display(df_multi)
print("Total transações (contas) com 2+ modelos:", df_multi.count())
print("Total colunas:", len(df_multi.columns))


In [0]:
%pip install mlxtend


In [0]:
from pyspark.sql import functions as F
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# 1) Load the table; each account (conta_name) is one transaction.
df = spark.table("sc_gold.deals_2").select("conta_name", "modelos")

df = df.withColumn("transaction_id", F.col("conta_name"))

# 2) Binary pivot: one row per transaction, one 0/1 column per model.
df_flags = (
    df.groupBy("transaction_id")
      .pivot("modelos")
      .agg(F.lit(1))
      .fillna(0)
)

# 3) Keep only accounts with 2 or more models.
cols_modelos = [c for c in df_flags.columns if c != "transaction_id"]

df_multi = df_flags.filter(
    sum(df_flags[c] for c in cols_modelos) >= 2
)

print("Contas com >=2 modelos:", df_multi.count())

# 4) Convert to pandas for Apriori.
#    mlxtend expects a boolean one-hot frame; integer 0/1 input still works but
#    triggers a DeprecationWarning and a slower code path, so cast to bool.
pdf = df_multi.toPandas().set_index("transaction_id").astype(bool)

# 5) Apriori: frequent itemsets.
freq_itemsets = apriori(
    pdf,
    min_support=0.001,   # tunable
    use_colnames=True
)
freq_itemsets["length"] = freq_itemsets["itemsets"].apply(len)

# 6) Make the itemsets displayable: frozenset -> list so Spark/Arrow can serialise them.
freq_itemsets_disp = freq_itemsets.copy()
freq_itemsets_disp["itemsets"] = freq_itemsets_disp["itemsets"].apply(list)

freq_itemsets_spark = spark.createDataFrame(freq_itemsets_disp)

print("Itemsets totais:", freq_itemsets.shape)
print("Itemsets com 2+ modelos:", freq_itemsets.query("length >= 2").shape)

display(
    freq_itemsets_spark.orderBy(F.desc("support")).limit(20)
)

# 7) Association rules filtered by minimum confidence.
rules = association_rules(
    freq_itemsets,
    metric="confidence",
    min_threshold=0.05
)

# frozenset -> list, again for Arrow.
rules_disp = rules.copy()
rules_disp["antecedents"] = rules_disp["antecedents"].apply(list)
rules_disp["consequents"] = rules_disp["consequents"].apply(list)

rules_spark = spark.createDataFrame(rules_disp)

print("Número de regras:", rules.shape[0])

display(
    rules_spark.orderBy(F.desc("lift")).limit(20)
)


In [0]:
from pyspark.sql import functions as F
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt

# 1) Load the table; each account (conta_name) is one transaction.
df = spark.table("sc_gold.deals_2").select("conta_name", "modelos")

df = df.withColumn("transaction_id", F.col("conta_name"))

# 2) Binary pivot (0/1 flag per model).
df_flags = (
    df.groupBy("transaction_id")
      .pivot("modelos")
      .agg(F.lit(1))
      .fillna(0)
)

# 3) Keep only accounts with 2 or more models.
cols_modelos = [c for c in df_flags.columns if c != "transaction_id"]

df_multi = df_flags.filter(
    sum(df_flags[c] for c in cols_modelos) >= 2
)

print("Contas com >=2 modelos:", df_multi.count())

# 4) Convert to pandas (one-hot matrix) for Apriori.
#    mlxtend expects a boolean one-hot frame; integer 0/1 input still works but
#    triggers a DeprecationWarning and a slower code path, so cast to bool.
pdf = df_multi.toPandas().set_index("transaction_id").astype(bool)

# 5) Apriori: frequent itemsets.
freq_itemsets = apriori(
    pdf,
    min_support=0.05,   # lower it for more rules, raise it for fewer
    use_colnames=True
)
freq_itemsets["length"] = freq_itemsets["itemsets"].apply(len)

print("Itemsets totais:", freq_itemsets.shape)

# 6) Association rules.
rules = association_rules(
    freq_itemsets,
    metric="confidence",
    min_threshold=0.1    # lower it for more rules, raise it for fewer
)

print("Nº total de regras:", rules.shape[0])

# 7) Keep only A->B rules (exactly one antecedent and one consequent).
rules_ab = rules[
    (rules["antecedents"].apply(len) == 1) &
    (rules["consequents"].apply(len) == 1)
].copy()

print("Nº de regras A->B:", rules_ab.shape[0])

# Plain model name instead of a one-element frozenset.
rules_ab["antecedent"] = rules_ab["antecedents"].apply(lambda x: list(x)[0])
rules_ab["consequent"] = rules_ab["consequents"].apply(lambda x: list(x)[0])

# 8) Prepare for display in Databricks: frozenset -> list so Arrow can serialise it.
rules_ab_disp = rules_ab.copy()
rules_ab_disp["antecedents"] = rules_ab_disp["antecedents"].apply(list)
rules_ab_disp["consequents"] = rules_ab_disp["consequents"].apply(list)

rules_ab_spark = spark.createDataFrame(rules_ab_disp)

# Top A->B rules ordered by lift.
display(
    rules_ab_spark.orderBy(F.desc("lift")).limit(50)
)

# 9) Lift heatmap for the A->B rules (pandas + matplotlib).
if rules_ab.empty:
    print("Não existem regras A->B para gerar heatmap.")
else:
    # antecedent x consequent matrix of lift; cells without a rule get the
    # neutral lift value 1.
    lift_matrix = rules_ab[["antecedent", "consequent", "lift"]].pivot(
        index="antecedent", columns="consequent", values="lift"
    )
    lift_matrix = lift_matrix.fillna(1.0)

    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(lift_matrix.values, aspect="auto")
    fig.colorbar(im, ax=ax, label="Lift")

    ax.set_xticks(range(len(lift_matrix.columns)))
    ax.set_xticklabels(lift_matrix.columns, rotation=90)
    ax.set_yticks(range(len(lift_matrix.index)))
    ax.set_yticklabels(lift_matrix.index)

    ax.set_title("Heatmap do Lift das Regras A→B")
    ax.set_xlabel("Consequente (B)")
    ax.set_ylabel("Antecedente (A)")
    fig.tight_layout()
    display(fig)


BLOCO SOMENTE PARA REGRAS B→A + HEATMAP

In [0]:
# --- B→A rules (1 item → 1 item), obtained by reversing the mined rules ---
#
# Reversing a rule changes more than the two item columns: the antecedent and
# consequent supports swap as well, and direction-dependent metrics such as
# confidence no longer apply. Swapping only "antecedents"/"consequents" would
# show the A→B confidence next to a B→A rule.
# NOTE(review): association_rules() normally already emits both directions of a
# pair when each passes the threshold — confirm this reversed view is still needed.

# Swap item columns and their supports in one step, keeping the column order.
rules_ba = rules.rename(columns={
    "antecedents": "consequents",
    "consequents": "antecedents",
    "antecedent support": "consequent support",
    "consequent support": "antecedent support",
})[list(rules.columns)].copy()

# Keep only 1→1 rules.
is_one_to_one = (
    (rules_ba["antecedents"].apply(len) == 1) &
    (rules_ba["consequents"].apply(len) == 1)
)
rules_ba = rules_ba[is_one_to_one].copy()

# Recompute confidence for the reversed direction: P(consequent | antecedent).
rules_ba["confidence"] = rules_ba["support"] / rules_ba["antecedent support"]

# Drop the remaining direction-dependent metrics instead of showing values that
# belong to the original direction (columns absent in older mlxtend are ignored).
rules_ba = rules_ba.drop(
    columns=["conviction", "zhangs_metric", "certainty"],
    errors="ignore",
)

print("Nº de regras B→A:", rules_ba.shape[0])

# Plain model names instead of one-element frozensets.
rules_ba["antecedent"]  = rules_ba["antecedents"].apply(lambda x: list(x)[0])
rules_ba["consequent"]  = rules_ba["consequents"].apply(lambda x: list(x)[0])

# Prepare for display in Databricks: frozenset -> list so Arrow can serialise it.
rules_ba_disp = rules_ba.copy()
rules_ba_disp["antecedents"] = rules_ba_disp["antecedents"].apply(list)
rules_ba_disp["consequents"] = rules_ba_disp["consequents"].apply(list)

rules_ba_spark = spark.createDataFrame(rules_ba_disp)

# Top B→A rules ordered by lift.
display(
    rules_ba_spark.orderBy(F.desc("lift")).limit(50)
)

# --- Lift heatmap for the B→A rules ---

if rules_ba.empty:
    print("Não existem regras B→A.")
else:
    # antecedent x consequent matrix of lift; missing cells get the neutral value 1.
    lift_matrix_ba = rules_ba[["antecedent", "consequent", "lift"]].pivot(
        index="antecedent", columns="consequent", values="lift"
    )
    lift_matrix_ba = lift_matrix_ba.fillna(1.0)

    fig, ax = plt.subplots(figsize=(10, 8))
    im = ax.imshow(lift_matrix_ba.values, aspect="auto")
    fig.colorbar(im, ax=ax, label="Lift (B→A)")

    ax.set_xticks(range(len(lift_matrix_ba.columns)))
    ax.set_xticklabels(lift_matrix_ba.columns, rotation=90)
    ax.set_yticks(range(len(lift_matrix_ba.index)))
    ax.set_yticklabels(lift_matrix_ba.index)

    ax.set_title("Heatmap do Lift das Regras B→A")
    ax.set_xlabel("Consequente (A)")
    ax.set_ylabel("Antecedente (B)")
    fig.tight_layout()
    display(fig)


{A, B} → C

In [0]:
# --- Apriori re-run with thresholds tuned for {A, B} → C rules ---
# Reuses the one-hot frame `pdf` built in the earlier Apriori cell.

# Lower support than the A→B run so that 3-item sets can qualify.
freq_itemsets = apriori(pdf, min_support=0.015, use_colnames=True)
freq_itemsets["length"] = freq_itemsets["itemsets"].apply(len)

itemsets_2_plus = freq_itemsets[freq_itemsets["length"] >= 2]
itemsets_3_plus = freq_itemsets[freq_itemsets["length"] >= 3]
print("Itemsets com 2+ modelos:", itemsets_2_plus.shape)
print("Itemsets com 3+ modelos:", itemsets_3_plus.shape)

# Low confidence threshold.
rules = association_rules(freq_itemsets, metric="confidence", min_threshold=0.1)

# Rules with exactly two antecedents and one consequent.
n_antecedents = rules["antecedents"].apply(len)
n_consequents = rules["consequents"].apply(len)
rules_abc = rules[(n_antecedents == 2) & (n_consequents == 1)]
print("Regras A–B → C:", rules_abc.shape[0])

In [0]:
# --- {A, B} → C rules (two antecedents, one consequent) ---

has_two_antecedents = rules["antecedents"].apply(len) == 2
has_one_consequent = rules["consequents"].apply(len) == 1
rules_abc = rules[has_two_antecedents & has_one_consequent].copy()

print("Nº de regras {A, B} → C:", rules_abc.shape[0])

# Readable versions: antecedent as a list, consequent as a plain model name.
rules_abc["antecedent"]  = rules_abc["antecedents"].apply(list)
rules_abc["consequent"]  = rules_abc["consequents"].apply(lambda s: next(iter(s)))

# Prepare for display in Databricks: frozenset -> list so Arrow can serialise it.
rules_abc_disp = rules_abc.copy()
for set_column in ("antecedents", "consequents"):
    rules_abc_disp[set_column] = rules_abc_disp[set_column].apply(list)

rules_abc_spark = spark.createDataFrame(rules_abc_disp)

# Top rules ordered by lift.
display(
    rules_abc_spark.orderBy(F.desc("lift")).limit(50)
)

# --- Lift heatmap for {A, B} → C ---

if rules_abc.empty:
    print("Não existem regras do tipo A–B → C.")
else:
    # Row key for the antecedent pair: "A + B", with the two names sorted.
    rules_abc["antecedent_pair"] = rules_abc["antecedent"].apply(
        lambda pair: " + ".join(sorted(pair))
    )

    # antecedent pair x consequent matrix of lift; missing cells get the neutral value 1.
    lift_matrix_abc = rules_abc[["antecedent_pair", "consequent", "lift"]].pivot(
        index="antecedent_pair", columns="consequent", values="lift"
    )
    lift_matrix_abc = lift_matrix_abc.fillna(1.0)

    fig, ax = plt.subplots(figsize=(12, 8))
    im = ax.imshow(lift_matrix_abc.values, aspect="auto")
    fig.colorbar(im, ax=ax, label="Lift ( {A,B} → C )")

    ax.set_xticks(range(len(lift_matrix_abc.columns)))
    ax.set_xticklabels(lift_matrix_abc.columns, rotation=90)
    ax.set_yticks(range(len(lift_matrix_abc.index)))
    ax.set_yticklabels(lift_matrix_abc.index)

    ax.set_title("Heatmap do Lift das Regras {A, B} → C")
    ax.set_xlabel("Consequente (C)")
    ax.set_ylabel("Antecedente (A + B)")
    fig.tight_layout()
    display(fig)


In [0]:
# ============================================================
# Basket (flags binárias, transaction = conta_name)
# + Resumo + Pares com Lift (Spark)
# ============================================================
from pyspark.sql import functions as F
from pyspark.sql import Window
import matplotlib.pyplot as plt
import pandas as pd

# -----------------------------
# 0) Data source and 0/1 basket
# -----------------------------
# Transaction id = conta_name.
df = (
    spark.table("sc_gold.deals_2")
         .select("conta_name", "modelos")
         .withColumn("transaction_id", F.col("conta_name"))
)

# Wide binary basket: one row per transaction, one 0/1 column per model.
df_flags = df.groupBy("transaction_id").pivot("modelos").agg(F.lit(1)).fillna(0)

# Basket size = number of models flagged for the account.
item_cols = [c for c in df_flags.columns if c != "transaction_id"]

sum_expr = None
for flag in map(F.col, item_cols):
    sum_expr = flag if sum_expr is None else sum_expr + flag

df_basket_size = df_flags.select("transaction_id", sum_expr.alias("basket_size"))

# Keep only accounts with 2+ models.
multi_item_ids = df_basket_size.filter(F.col("basket_size") >= 2)
df_flags = df_flags.join(multi_item_ids, "transaction_id").drop("basket_size")

display(df_flags.limit(10))
print("Total de colunas (inclui transaction_id):", len(df_flags.columns))

# -----------------------------
# 1) Descriptive summary (items)
# -----------------------------
n_trans = df_flags.count()
item_cols = [c for c in df_flags.columns if c != "transaction_id"]
n_items = len(item_cols)

# Individual support: sum(flag) / n_trans
agg_exprs = [F.sum(F.col(c)).alias(c) for c in item_cols]
freq_pdf = (
    df_flags.agg(*agg_exprs)
            .toPandas()
            .T.reset_index()
            .rename(columns={"index":"item", 0:"count"})
)
freq_pdf["support"] = freq_pdf["count"] / n_trans
freq_pdf = freq_pdf.sort_values("support", ascending=False)

# Metrics for the slide
top10_pdf = freq_pdf.head(10).copy()
top10_items = top10_pdf["item"].tolist()

# Coverage = share of transactions containing at least one of the top-10 items.
# Summing the per-item counts would count a transaction once per top-10 item it
# holds, so the figure could exceed 100% (every basket here has 2+ items).
if n_trans > 0 and top10_items:
    has_top10_item = sum(F.col(c) for c in top10_items) > 0
    cov_top10 = df_flags.filter(has_top10_item).count() / n_trans
else:
    cov_top10 = 0.0

sup_max = freq_pdf["support"].max() if not freq_pdf.empty else 0.0
pct_long_tail_lt1 = (freq_pdf["support"] < 0.01).mean() if not freq_pdf.empty else 0.0

# Basket size (items per transaction) – already filtered to >= 2
df_basket_size = df_flags.select(
    "transaction_id",
    sum(F.col(c) for c in item_cols).alias("basket_size")
)

basket_stats = df_basket_size.groupBy("basket_size").count().orderBy("basket_size")
display(basket_stats)

avg_basket = df_basket_size.agg(F.avg("basket_size")).first()[0] if n_trans > 0 else 0.0
pct_ge2 = 1.0  # by construction every remaining transaction has >= 2 items

print("\n===== RESUMO (para slide) =====")
print(f"Transações (contas com 2+ modelos): {n_trans:,}")
print(f"Itens (modelos) distintos: {n_items}")
print(f"Suporte do item mais frequente: {sup_max:.2%}")
print(f"Cobertura do Top 10 itens: {cov_top10:.2%}")
print(f"Média de itens por transação: {avg_basket:.2f}")
print(f"% de transações com 2+ itens: {pct_ge2:.2%}")
print(f"% de itens com suporte < 1% (long tail): {pct_long_tail_lt1:.2%}")

# -----------------------------
# 2) Chart: top 15 items
# -----------------------------
if freq_pdf.empty:
    print("Sem itens para plotar (freq_pdf vazio).")
else:
    top15 = freq_pdf.head(15).copy()
    fig, ax = plt.subplots(figsize=(10,5))
    ax.barh(top15["item"], top15["support"])
    ax.invert_yaxis()  # most frequent item at the top
    ax.set_xlabel("Support (proporção das transações)")
    ax.set_title("Top 15 modelos mais frequentes (contas com 2+ modelos)")
    fig.tight_layout()
    plt.show()

# -----------------------------
# 3) PAIRS (A,B): support, confidence and LIFT
#    - rebuilds the item list per transaction (from df_flags)
#    - generates each unordered pair once
# -----------------------------

# 3.1) Long format: (transaction_id, item) only where flag = 1.
#      Built with column expressions rather than a formatted SQL stack() string,
#      so model names containing quotes or backticks cannot break the query.
present_items = F.array(*[F.when(F.col(c) == 1, F.lit(c)) for c in item_cols])

long_flags = (
    df_flags.select("transaction_id", F.explode(present_items).alias("item"))
            .where(F.col("item").isNotNull())   # NULL = flag was 0 for that model
)

# 3.2) Items per transaction (array)
tx_items = long_flags.groupBy("transaction_id").agg(F.collect_set("item").alias("items"))

# 3.3) Pairs per transaction: i<j emits each pair once, array_sort fixes its orientation.
a = tx_items.select("transaction_id", F.posexplode("items").alias("i","A"))
b = tx_items.select("transaction_id", F.posexplode("items").alias("j","B"))
pairs = (
    a.join(b, "transaction_id")
     .where(F.col("i") < F.col("j"))
     .select(F.array_sort(F.array("A","B")).alias("AB"))
)

pairs = pairs.select(
    F.col("AB")[0].alias("A"),
    F.col("AB")[1].alias("B")
)

# 3.4) 1-item and 2-item supports
support_1 = (
    long_flags.groupBy("item")
              .agg(F.countDistinct("transaction_id").alias("cnt"))
              .withColumn("support", F.col("cnt")/F.lit(n_trans))
              .withColumnRenamed("item","X")
)

support_2 = (
    pairs.groupBy("A","B")
         .agg(F.count("*").alias("cnt"))
         .withColumn("support_ab", F.col("cnt")/F.lit(n_trans))
)

# 3.5) Attach sup(A), sup(B) and compute the metrics
rules_pairs = (
    support_2
      .join(support_1.withColumnRenamed("X","A").withColumnRenamed("support","sup_a"), on="A", how="left")
      .join(support_1.withColumnRenamed("X","B").withColumnRenamed("support","sup_b"), on="B", how="left")
      .withColumn("confidence_a_to_b", F.col("support_ab")/F.col("sup_a"))
      .withColumn("confidence_b_to_a", F.col("support_ab")/F.col("sup_b"))
      .withColumn("lift", F.col("support_ab")/(F.col("sup_a")*F.col("sup_b")))
      .orderBy(F.desc("lift"), F.desc("support_ab"))
)

display(rules_pairs.limit(50))

# -----------------------------
# 4) LIFT heatmap of the most relevant pairs
# -----------------------------
rules_pairs_pdf = rules_pairs.toPandas()

if rules_pairs_pdf.empty:
    print("Regras de pares vazias; sem heatmap.")
else:
    # Top 15 values on each side of the pair, ranked by their highest lift.
    top_by_side = {
        side: (
            rules_pairs_pdf.groupby(side)["lift"]
                           .max()
                           .sort_values(ascending=False)
                           .head(15)
                           .index.tolist()
        )
        for side in ("A", "B")
    }
    top_a, top_b = top_by_side["A"], top_by_side["B"]

    in_top = rules_pairs_pdf["A"].isin(top_a) & rules_pairs_pdf["B"].isin(top_b)
    mat = rules_pairs_pdf[in_top].pivot_table(
        index="A", columns="B", values="lift", fill_value=0
    )

    if mat.empty:
        print("Sem matriz suficiente para o heatmap de pares (filtro TOP vazio).")
    else:
        fig, ax = plt.subplots(figsize=(14,8))
        im = ax.imshow(mat.values, aspect='auto')
        ax.set_xticks(range(len(mat.columns)))
        ax.set_xticklabels(mat.columns, rotation=90)
        ax.set_yticks(range(len(mat.index)))
        ax.set_yticklabels(mat.index)
        ax.set_title("Lift — Pares (A,B) | transaction = conta_name, cestas com 2+ modelos")
        fig.colorbar(im, ax=ax, label="Lift")
        fig.tight_layout()
        plt.show()

# -----------------------------
# 5) Final table for the slide: top pairs after filtering
# -----------------------------
MIN_SUP_AB   = 0.01   # pair present in >= 1% of transactions (tune to the data volume)
MIN_CONF     = 0.20   # >= 20% confidence in at least one direction
MIN_LIFT     = 1.2    # lift above 1 indicates a positive association

enough_support = F.col("support_ab") >= MIN_SUP_AB
enough_confidence = (
    (F.col("confidence_a_to_b") >= MIN_CONF) | (F.col("confidence_b_to_a") >= MIN_CONF)
)
enough_lift = F.col("lift") >= MIN_LIFT

rules_pairs_nice = (
    rules_pairs
      .where(enough_support & enough_confidence & enough_lift)
      .select(
          "A",
          "B",
          F.col("support_ab").alias("support_ab"),
          F.round(F.col("confidence_a_to_b"), 4).alias("conf_A_to_B"),
          F.round(F.col("confidence_b_to_a"), 4).alias("conf_B_to_A"),
          F.round(F.col("lift"), 4).alias("lift"),
      )
      .orderBy(F.desc("lift"), F.desc("support_ab"))
)

display(rules_pairs_nice.limit(50))

# -----------------------------
# 6) Export tables to the catalog
# -----------------------------
# overwriteSchema is required for df_flags: its columns come from a pivot over the
# model names, so the schema changes whenever the set of models changes and a plain
# overwrite of an existing table would then be rejected. It is set on both writes
# for consistency with the other overwrite steps in this notebook.
spark.sql("CREATE SCHEMA IF NOT EXISTS sc_gold")
(rules_pairs.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("sc_gold.mba_pairs_rules_conta"))
(df_flags.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("sc_gold.mba_basket_flags_conta"))

print("\nPronto! Resumo, gráficos e pares com lift (transaction = conta_name, apenas contas com 2+ modelos) gerados.")


FP-Growth / Apriori / MBA	array<string> (lista de itens)	É o formato nativo esperado pelos algoritmos de cesta
Binary flags (One-Hot Encoding)	1 coluna por item (0/1)	Só é útil se fores usar modelos tipo regressão, clustering, ML clássico

Ou seja:

Para descobrir regras de associação → listas são o formato correto

Para fazer modelos supervisionados / perfis → binary flags é o formato correto

Por isso não transformei para wide format (0/1) — só atrapalha FP-Growth e explode o número de colunas.

In [0]:
# Confirm the minimal structure and look at a sample.
df = spark.table(TABLE_SRC).select("conta_name", "data_venda", "modelos")
display(df.limit(20))  # equivalent to SELECT ... LIMIT 20

# Check whether any (conta_name, data_venda) transaction holds 2+ distinct models.
models_per_sale = (
    df.filter(F.col("modelos").isNotNull())
      .groupBy("conta_name", "data_venda")
      .agg(F.countDistinct("modelos").alias("n_modelos"))
)
check = models_per_sale.orderBy(F.col("n_modelos").desc())
display(check.limit(10))

In [0]:
# Build the transaction identifier (temp view: deals_tx).
# transaction_id = conta_name + "_" + sale date formatted as yyyyMMdd.
sale_day = F.date_format(F.col("data_venda"), "yyyyMMdd")
transaction_id_expr = F.concat_ws("_", F.col("conta_name"), sale_day)

deals_with_model = df.filter(F.col("modelos").isNotNull())
deals_tx = (
    deals_with_model
      .withColumn("modelos", F.trim("modelos"))
      .select(
          "conta_name",
          "data_venda",
          "modelos",
          transaction_id_expr.alias("transaction_id"),
      )
)

deals_tx.createOrReplaceTempView("deals_tx")  # temporary view
display(deals_tx.limit(10))




In [0]:
# Item list per transaction (temp view: tx_grouped).

# One row per (transaction, model).
unique_items = deals_tx.dropDuplicates(["transaction_id", "modelos"])

# Collect the distinct models of each transaction into an array.
baskets = unique_items.groupBy("transaction_id").agg(
    F.collect_set("modelos").alias("models_list")
)

# Keep only transactions with more than one model (the SQL HAVING size(...) > 1).
tx_grouped = baskets.filter(F.size("models_list") > 1)

tx_grouped.createOrReplaceTempView("tx_grouped")
print("Nº transações com 2+ modelos:", tx_grouped.count())
display(tx_grouped.limit(10))

In [0]:
# Explode item pairs per transaction (temp view: tx_pairs).
#
# collect_set() gives no ordering guarantee, so the list is sorted before it is
# exploded. With a sorted list, position i < j implies model_a < model_b, so every
# unordered pair {X, Y} is always emitted in the same orientation and its
# occurrences are counted together instead of being split between (X, Y) and (Y, X).
tx_sorted = tx_grouped.select(
    "transaction_id",
    F.array_sort("models_list").alias("models_list"),
)
a = tx_sorted.select("transaction_id", F.posexplode("models_list").alias("i", "model_a"))
b = tx_sorted.select("transaction_id", F.posexplode("models_list").alias("j", "model_b"))

tx_pairs = (
    a.join(b, on="transaction_id", how="inner")
     .where(F.col("i") < F.col("j"))                  # each pair once, no self-pairs
     .select("model_a", "model_b")
)

tx_pairs.createOrReplaceTempView("tx_pairs")
display(tx_pairs.limit(20))


In [0]:
# Supports (temp views: support_1 and support_2)

# Denominator = number of transactions (with 2+ models)
N_tx = tx_grouped.count()
print("N_tx (denominador do suporte) =", N_tx)

# Single-item support = share of transactions that contain the model.
# It is computed from the per-transaction item lists: models_list comes from
# collect_set, so each model appears at most once per transaction and one exploded
# row equals one transaction. Counting rows of tx_pairs grouped by model_a would
# instead count pairs (inflated for large baskets) and would miss every model that
# never appears in the first position of a pair.
support_1 = (
    tx_grouped
      .select(F.explode("models_list").alias("model"))
      .groupBy("model")
      .agg(F.count("*").alias("cnt"))
      .withColumn("support", F.col("cnt") / F.lit(N_tx))
)
support_1.createOrReplaceTempView("support_1")
display(support_1.orderBy(F.desc("support")).limit(20))

# Pair support = share of transactions that contain both models.
support_2 = (
    tx_pairs
      .groupBy("model_a","model_b")
      .agg(F.count("*").alias("cnt"))
      .withColumn("support", F.col("cnt") / F.lit(N_tx))
)
support_2.createOrReplaceTempView("support_2")
display(support_2.orderBy(F.desc("support")).limit(20))


In [0]:
# A→B rules (confidence, lift) and persistence of the final table.

# confidence(A→B) = sup(A,B) / sup(A);  lift = confidence / sup(B)
confidence_expr = F.col("s2.support") / F.col("s1.support")
lift_expr = confidence_expr / F.col("s3.support")

# s1 supplies sup(A) (joined on model_a), s3 supplies sup(B) (joined on model_b).
candidate_rules = (
    support_2.alias("s2")
      .join(support_1.alias("s1"), F.col("s1.model") == F.col("s2.model_a"), "inner")
      .join(support_1.alias("s3"), F.col("s3.model") == F.col("s2.model_b"), "inner")
      .select(
          F.col("s2.model_a").alias("antecedent"),
          F.col("s2.model_b").alias("consequent"),
          F.col("s2.support").alias("support_ab"),
          F.col("s1.support").alias("support_a"),
          F.col("s3.support").alias("support_b"),
          confidence_expr.alias("confidence"),
          lift_expr.alias("lift"),
      )
)

# Thresholds come from the MIN_SUPPORT / MIN_CONFIDENCE constants set earlier.
passes_thresholds = (
    (F.col("support_ab") >= F.lit(MIN_SUPPORT))
    & (F.col("confidence") >= F.lit(MIN_CONFIDENCE))
    & (F.col("lift") >= F.lit(1.0))
)

rules_df = candidate_rules.where(passes_thresholds).orderBy(
    F.desc("confidence"), F.desc("lift"), F.desc("support_ab")
)

# Permanent table with the rules.
rules_df.write.format("delta").mode("overwrite").saveAsTable("sc_gold.mba_modelos_rules")

display(spark.table("sc_gold.mba_modelos_rules").limit(50))


In [0]:

# Quick look at the strongest stored rules, highest lift first.
top_rules_by_lift = (
    spark.table("sc_gold.mba_modelos_rules")
         .orderBy(F.col("lift").desc())
         .limit(50)
)
display(top_rules_by_lift)


In [0]:
from pyspark.sql import functions as F

TOP_K = 20   # adjust as needed

# Top-K models by count.
top_model_rows = (
    support_1.orderBy(F.desc("cnt"))
             .limit(TOP_K)
             .select("model")
             .collect()
)
top_models = [row["model"] for row in top_model_rows]

# Pairs where both models are among the Top-K.
both_in_top = F.col("model_a").isin(top_models) & F.col("model_b").isin(top_models)
pairs_top = support_2.where(both_in_top).select("model_a", "model_b", "cnt")

# Bring the pairs to pandas to build the matrix and draw it with matplotlib.
pdf_pairs = pairs_top.toPandas()

# Make the matrix symmetric: add both (a, b) and (b, a).
import pandas as pd
forward = pdf_pairs.rename(columns={"model_a":"row","model_b":"col","cnt":"value"})
backward = pdf_pairs.rename(columns={"model_b":"row","model_a":"col","cnt":"value"})
symm = pd.concat([forward, backward], ignore_index=True)

# Diagonal: the individual counts taken from support_1.
pdf_1 = (
    support_1.where(F.col("model").isin(top_models))
             .select("model", "cnt")
             .toPandas()
)
diag = pd.DataFrame({"row": pdf_1["model"], "col": pdf_1["model"], "value": pdf_1["cnt"]})

mat = pd.concat([symm, diag], ignore_index=True)
pivot = mat.pivot_table(index="row", columns="col", values="value", fill_value=0)

# Plot
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10,8))
im = ax.imshow(pivot.values)
ax.set_xticks(range(len(pivot.columns)))
ax.set_xticklabels(pivot.columns, rotation=90)
ax.set_yticks(range(len(pivot.index)))
ax.set_yticklabels(pivot.index)
ax.set_title("Co-ocorrência de modelos (Top-{})".format(TOP_K))
fig.colorbar(im, ax=ax, label="# pares na mesma transação")
fig.tight_layout()
plt.show()


In [0]:
# Reverse (B→A) rules and their union with the A→B rules.
# Thresholds (tunable); these replace the values set in the earlier config cell.
MIN_SUPPORT = 0.01
MIN_CONFIDENCE = 0.30

one_sup_x = support_1.select(F.col("model").alias("x"), F.col("support").alias("sup_x"))
one_sup_y = support_1.select(F.col("model").alias("y"), F.col("support").alias("sup_y"))


def _directed_rules(antecedent_col, consequent_col):
    """Build thresholded rules from support_2 for one direction.

    antecedent_col / consequent_col name the support_2 columns ("model_a" or
    "model_b") that play each role. one_sup_x supplies the antecedent's support
    and one_sup_y the consequent's support.
    """
    confidence = F.col("s2.support") / F.col("s1.sup_x")
    passes_thresholds = (
        (F.col("support_ab") >= MIN_SUPPORT)
        & (F.col("confidence") >= MIN_CONFIDENCE)
        & (F.col("lift") >= 1.0)
    )
    return (
        support_2.alias("s2")
          .join(one_sup_x.alias("s1"), F.col("s1.x") == F.col("s2." + antecedent_col))
          .join(one_sup_y.alias("s3"), F.col("s3.y") == F.col("s2." + consequent_col))
          .select(
              F.col("s2." + antecedent_col).alias("antecedent"),
              F.col("s2." + consequent_col).alias("consequent"),
              F.col("s2.support").alias("support_ab"),
              F.col("s1.sup_x").alias("support_a"),
              F.col("s3.sup_y").alias("support_b"),
              confidence.alias("confidence"),
              (confidence / F.col("s3.sup_y")).alias("lift"),
          )
          .where(passes_thresholds)
    )


# A->B: model_a is the antecedent.
rules_ab = _directed_rules("model_a", "model_b")

# B->A (reverse): model_b is the antecedent.
rules_ba = _directed_rules("model_b", "model_a")

rules_all = rules_ab.unionByName(rules_ba).orderBy(
    F.desc("confidence"), F.desc("lift"), F.desc("support_ab")
)
display(rules_all.limit(100))

# (optional) persist
#rules_all.write.format("delta").mode("overwrite").saveAsTable("sc_gold.mba_modelos_rules_ba")

# (opcional) gravar
#rules_all.write.format("delta").mode("overwrite").saveAsTable("sc_gold.mba_modelos_rules_ba")


In [0]:
# Rules 2→1 (A,B→C)
# Build (A,B,C) triples per transaction with three posexplode passes and the
# condition i<j<k, then derive confidence and lift of (A,B)→C.
# NOTE(review): i<j<k orders items by their position inside models_list, not by
# value. If models_list is not stored in a canonical order (TODO confirm how
# tx_grouped and support_2 were built), the same item set can be split across
# several (A,B,C) rows and may miss its (model_a, model_b) row in support_2.
import matplotlib.pyplot as plt

# 3.1 distinct triples per transaction
a = tx_grouped.select("transaction_id", F.posexplode("models_list").alias("i","A"))
b = tx_grouped.select("transaction_id", F.posexplode("models_list").alias("j","B"))
c = tx_grouped.select("transaction_id", F.posexplode("models_list").alias("k","C"))

triples = (
    a.join(b, "transaction_id").join(c, "transaction_id")
     .where((F.col("i") < F.col("j")) & (F.col("j") < F.col("k")))  # i<j<k
     .select("A","B","C")
)

# 3.2 support of each triple (share of transactions in tx_grouped)
N_tx = tx_grouped.count()
support_3 = (
    triples.groupBy("A","B","C")
           .agg(F.count("*").alias("cnt"))
           .withColumn("support_abc", F.col("cnt")/F.lit(N_tx))
)

# 3.3 sup(A,B) and sup(C) are needed for confidence and lift
pairs_sup = support_2.select(F.col("model_a").alias("A"), F.col("model_b").alias("B"), F.col("support").alias("sup_ab"))
one_sup  = support_1.select(F.col("model").alias("C"), F.col("support").alias("sup_c"))

# 3.4 rules (A,B) -> C
MIN_SUPPORT_ABC = 0.005   # 0.5% (tune as needed)
MIN_CONF_2TO1   = 0.20    # 20%

rules_2to1 = (
    support_3
      .join(pairs_sup, on=["A","B"], how="inner")
      .join(one_sup, on=["C"], how="inner")
      .select(
          F.array_sort(F.array("A","B")).alias("antecedent_AB"),
          F.col("C").alias("consequent"),
          F.col("support_abc"),
          F.col("sup_ab"),
          F.col("sup_c"),
          (F.col("support_abc")/F.col("sup_ab")).alias("confidence"),
          ((F.col("support_abc")/F.col("sup_ab"))/F.col("sup_c")).alias("lift")
      )
      .where(
          (F.col("support_abc") >= F.lit(MIN_SUPPORT_ABC)) &
          (F.col("confidence")  >= F.lit(MIN_CONF_2TO1)) &
          (F.col("lift")        >= F.lit(1.0))
      )
      .orderBy(F.desc("confidence"), F.desc("lift"), F.desc("support_abc"))
)

display(rules_2to1.limit(100))

# Persist the rules. Another cell of this notebook writes the same table with a
# narrower column set (no sup_ab / sup_c); a plain overwrite does not replace a
# Delta table's schema, so overwriteSchema keeps this cell re-runnable
# whichever cell wrote the table last.
(rules_2to1.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("sc_gold.mba_modelos_rules_2to1"))


# Heatmap of lift for the strongest antecedent pairs and consequents
mba_modelos_rules_2to1 = spark.table("sc_gold.mba_modelos_rules_2to1")
pdf2 = mba_modelos_rules_2to1.toPandas()
if not pdf2.empty:
    # Readable label for the antecedent pair, e.g. "X + Y"
    pdf2["AB"] = pdf2["antecedent_AB"].apply(lambda xs: " + ".join(xs))
    # Keep the 15 antecedent pairs / consequents with the highest maximum lift
    top_ab = (pdf2.groupby("AB")["lift"].max().sort_values(ascending=False).head(15)).index.tolist()
    top_c  = (pdf2.groupby("consequent")["lift"].max().sort_values(ascending=False).head(15)).index.tolist()
    # Cells with no rule are shown as 0
    mat2 = pdf2[pdf2["AB"].isin(top_ab) & pdf2["consequent"].isin(top_c)] \
              .pivot_table(index="AB", columns="consequent", values="lift", fill_value=0)

    plt.figure(figsize=(14,8))
    plt.imshow(mat2.values)
    plt.xticks(range(len(mat2.columns)), mat2.columns, rotation=90)
    plt.yticks(range(len(mat2.index)), mat2.index)
    plt.title("Lift — Regras 2→1 (A,B→C)")
    plt.colorbar(label="Lift")
    plt.tight_layout()
    plt.show()


In [0]:
from pyspark.sql import functions as F
import matplotlib.pyplot as plt

# ====== Settings ======
TABLE_RULES = "sc_gold.mba_modelos_rules"
TOP_N = 15  # how many rules to show
# Optional: focus on certain models (case-insensitive). Use [] to keep every rule.
model_keywords = ["Ioniq", "Tucson", "Kauai", "I20", "I30", "Bayon", "Santa Fe"]

# ====== Read the rules and (optionally) keep only the models of interest ======
rule_columns = ["antecedent", "consequent", "support_ab", "support_a", "support_b", "confidence", "lift"]
rules = spark.table(TABLE_RULES).select(*rule_columns)

if model_keywords:
    # OR-pattern of the keywords, each prefixed with the case-insensitive flag
    pattern = "|".join(f"(?i){kw}" for kw in model_keywords)
    mentions_keyword = F.col("antecedent").rlike(pattern) | F.col("consequent").rlike(pattern)
    rules = rules.where(mentions_keyword)

# Top-N by lift, ties broken by confidence and then support
ranking = [F.desc("lift"), F.desc("confidence"), F.desc("support_ab")]
top_rules = rules.orderBy(*ranking).limit(TOP_N)

# ====== Convert to pandas and build the label used on the chart ======
pdf = top_rules.toPandas().fillna(0)
if pdf.empty:
    raise ValueError("Sem regras para plotar com os filtros atuais. Ajusta TOP_N ou 'model_keywords'.")

pdf["rule_label"] = pdf["antecedent"] + "  →  " + pdf["consequent"]

# ====== Horizontal bar chart by lift ======
fig, ax = plt.subplots(figsize=(11, 6))
ax.barh(pdf["rule_label"], pdf["lift"])
ax.set_xlabel("Lift (força da associação)")
ax.set_title(f"Top {len(pdf)} Regras Modelo → Modelo (ordenado por Lift)")
ax.invert_yaxis()  # first (strongest) rule at the top
fig.tight_layout()
plt.show()


In [0]:
import matplotlib.pyplot as plt

# Marker area scales linearly with lift: 40 for the lowest lift in `pdf`, up to
# about 160 for the highest; the 1e-9 keeps the division defined when every
# rule has the same lift.
lift_floor = pdf["lift"].min()
lift_span = pdf["lift"].max() - lift_floor + 1e-9
marker_sizes = 40 + 120 * (pdf["lift"] - lift_floor) / lift_span

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(pdf["support_ab"], pdf["confidence"], s=marker_sizes)

# Label every point with its rule
for ante, cons, sup_ab, conf in zip(pdf["antecedent"], pdf["consequent"], pdf["support_ab"], pdf["confidence"]):
    ax.annotate(
        f"{ante}→{cons}",
        (sup_ab, conf),
        fontsize=8,
        xytext=(3, 2),
        textcoords="offset points",
    )

ax.set_xlabel("Support AB")
ax.set_ylabel("Confidence")
ax.set_title("Regras Modelo→Modelo (tamanho ~ lift)")
fig.tight_layout()
plt.show()


In [0]:
from pyspark.sql import functions as F, types as T
from itertools import combinations
from functools import reduce
import pandas as pd

# =========================
# Parameters
# =========================
SRC_GROUPED = "sc_gold.deals_tx_grouped"   # transaction_id, models_list
SRC_RAW     = "sc_gold.deals_2"            # used when the grouped table does not exist
MIN_SUPPORT = 0.005                        # 0.5%
MIN_CONF    = 0.20                         # 20%
MAX_K       = 3                            # largest itemset size built by the fallback


def _prettify_rules(rules_df):
    """Turn array-typed antecedent/consequent columns into ", "-joined strings.

    Expects columns antecedent, consequent (array<string>), support_ab,
    confidence and lift; returns antecedent_str, consequent_str, support_ab,
    confidence, lift. Uses the native concat_ws instead of a Python UDF.
    """
    return rules_df.select(
        F.concat_ws(", ", "antecedent").alias("antecedent_str"),
        F.concat_ws(", ", "consequent").alias("consequent_str"),
        "support_ab", "confidence", "lift",
    )


# =========================
# Obtain / build tx_grouped
# =========================
if spark.catalog.tableExists(SRC_GROUPED):
    tx_grouped = spark.table(SRC_GROUPED)
else:
    # NOTE(review): an earlier cell of this notebook drops data_venda from
    # sc_gold.deals_2 — confirm the column exists when this branch runs.
    df = (spark.table(SRC_RAW)
          .select("conta_name","data_venda","modelos")
          .where(F.col("modelos").isNotNull()))
    # A transaction is one customer on one sale date
    deals_tx = (
        df.withColumn("modelos", F.trim("modelos"))
          .select(
              "conta_name","data_venda","modelos",
              F.concat_ws("_", F.col("conta_name"), F.date_format(F.col("data_venda"), "yyyyMMdd")).alias("transaction_id")
          )
          .dropDuplicates(["transaction_id","modelos"])
    )
    # Only baskets with at least two distinct models can produce rules
    tx_grouped = (
        deals_tx.groupBy("transaction_id")
                .agg(F.collect_set("modelos").alias("models_list"))
                .where(F.size("models_list") > 1)
    )

N = tx_grouped.count()
print("N transações (2+ itens):", N)

# =========================
# 1) Try the native FP-Growth (when supported)
# =========================
fp_itemsets = None
fp_rules_pretty = None

try:
    from pyspark.ml.fpm import FPGrowth
    fpg = FPGrowth(itemsCol="models_list", minSupport=MIN_SUPPORT, minConfidence=MIN_CONF)
    model = fpg.fit(tx_grouped)

    freq_itemsets = model.freqItemsets  # items (array<string>), freq
    fp_itemsets = freq_itemsets.withColumn("support", F.col("freq")/F.lit(N))

    rules = model.associationRules       # antecedent (array), consequent (array), confidence, lift, [support?]
    if "support" in rules.columns:
        rules = rules.withColumnRenamed("support","support_ab")
    else:
        # No support column on the rules: look it up from the itemset that is
        # the union of antecedent and consequent.
        sup_ab = (fp_itemsets
                  .select(F.array_sort("items").alias("ab"), F.col("support").alias("support_ab")))
        rules = (rules
                 .withColumn("ab", F.array_sort(F.array_union(F.col("antecedent"), F.col("consequent"))))
                 .join(sup_ab, "ab", "left")
                 .drop("ab"))

    fp_rules_pretty = _prettify_rules(rules)
    print("✅ FP-Growth MLlib concluído.")

except Exception as e:
    print("⚠️ FP-Growth MLlib indisponível — a usar fallback serverless-safe.")
    # Report why the native path failed, so a genuine bug is not mistaken for
    # "FP-Growth unsupported" (first line only: Spark errors can be very long).
    reason = (str(e).splitlines() or [""])[0]
    print(f"   Motivo: {type(e).__name__}: {reason}")
    # =========================
    # 2) FALLBACK: frequent itemsets & rules via DataFrame + mapInPandas
    # =========================

    # --- 2.1 Support of single items ---
    one_items = (
        tx_grouped
          .select(F.explode("models_list").alias("item"))
          .groupBy("item").agg(F.count("*").alias("cnt"))
          .withColumn("support", F.col("cnt") / F.lit(N))
    )
    # Materialise in Delta so later joins read the table instead of recomputing
    one_items.write.format("delta").mode("overwrite").saveAsTable("sc_gold.tmp_one_items")
    one_items = spark.table("sc_gold.tmp_one_items")

    # --- 2.2 Support of k-item sets (k>=2) via mapInPandas ---
    def combos_gen(k: int):
        """Return (mapInPandas function, output schema) emitting every k-combination.

        The function reads the `models_list` column of each pandas batch and
        yields one row per k-combination of the sorted, de-duplicated, non-null
        items of each list, in string columns i1..ik.
        """
        schema = T.StructType([T.StructField(f"i{t+1}", T.StringType(), True) for t in range(k)])
        def _fn(iterator):
            for pdf in iterator:
                rows = []
                for items in pdf["models_list"]:
                    # Rows whose value is not a list/tuple are skipped
                    if not isinstance(items, (list, tuple)):
                        continue
                    L = sorted(set(str(x) for x in items if x is not None))
                    for combo in combinations(L, k):
                        rows.append(combo)
                if rows:
                    yield pd.DataFrame(rows, columns=[f"i{t+1}" for t in range(k)])
                else:
                    yield pd.DataFrame(columns=[f"i{t+1}" for t in range(k)])
        return _fn, schema

    supports = []

    # k=2
    fn2, sch2 = combos_gen(2)
    sup2 = (tx_grouped.select("models_list")
            .mapInPandas(fn2, schema=sch2)
            .groupBy("i1","i2").agg(F.count("*").alias("cnt"))
            .withColumn("support", F.col("cnt")/F.lit(N))
            .withColumn("items", F.array("i1","i2"))
            .select("items","cnt","support"))
    supports.append(sup2)

    # k=3 (only when MAX_K >= 3)
    if MAX_K >= 3:
        fn3, sch3 = combos_gen(3)
        sup3 = (tx_grouped.select("models_list")
                .mapInPandas(fn3, schema=sch3)
                .groupBy("i1","i2","i3").agg(F.count("*").alias("cnt"))
                .withColumn("support", F.col("cnt")/F.lit(N))
                .withColumn("items", F.array("i1","i2","i3"))
                .select("items","cnt","support"))
        supports.append(sup3)

    # Unified itemsets (1, 2[, 3] items), kept only when they reach MIN_SUPPORT
    # so the saved table holds frequent itemsets, as in the FP-Growth branch.
    # The rules below still use the unfiltered supports.
    fp_itemsets = (
        one_items.select(F.array(F.col("item")).alias("items"), "cnt", "support")
        .unionByName(reduce(lambda left, right: left.unionByName(right), supports))
        .where(F.col("support") >= F.lit(MIN_SUPPORT))
    )

    # --- 2.3 Rules 1→1, in both directions ---
    pairs = sup2.select(F.col("items")[0].alias("a"),
                        F.col("items")[1].alias("b"),
                        F.col("support").alias("support_ab"))
    # `pairs` holds each unordered pair once (a sorted before b by combos_gen);
    # add the mirrored rows so b→a is evaluated as well as a→b.
    directed_pairs = pairs.unionByName(
        pairs.select(F.col("b").alias("a"), F.col("a").alias("b"), "support_ab")
    )

    s1  = one_items.select(F.col("item").alias("x"), F.col("support").alias("sup_x"))
    s1b = one_items.select(F.col("item").alias("y"), F.col("support").alias("sup_y"))

    rules_ab = (
        directed_pairs.join(s1, F.col("a") == F.col("x"))
             .join(s1b, F.col("b") == F.col("y"))
             .select(
                 F.array("a").alias("antecedent"),
                 F.array("b").alias("consequent"),
                 "support_ab",
                 (F.col("support_ab")/F.col("sup_x")).alias("confidence"),
                 ((F.col("support_ab")/F.col("sup_x"))/F.col("sup_y")).alias("lift")
             )
             .where((F.col("support_ab")>=F.lit(MIN_SUPPORT)) & (F.col("confidence")>=F.lit(MIN_CONF)))
    )

    # --- 2.4 Rules 2→1 (A,B→C) when MAX_K>=3 ---
    if MAX_K >= 3:
        sup_ab = pairs.select(F.array_sort(F.array("a","b")).alias("ab"),
                              F.col("support_ab"))
        sup_c = one_items.select(F.col("item").alias("C"),
                                 F.col("support").alias("sup_c"))
        abc_sorted = sup3.select(F.array_sort("items").alias("abc"),
                                 F.col("support").alias("support_abc"))
        # Each triple yields three rules: every element takes a turn as the
        # consequent, the other two (still in sorted order) form the antecedent.
        splits = [((0, 1), 2), ((0, 2), 1), ((1, 2), 0)]
        abc_split = reduce(
            lambda left, right: left.unionByName(right),
            [
                abc_sorted.select(
                    F.array(F.col("abc")[i], F.col("abc")[j]).alias("ab"),
                    F.col("abc")[k].alias("C"),
                    "support_abc",
                )
                for (i, j), k in splits
            ],
        )
        rules_2to1 = (
            abc_split.join(sup_ab, "ab").join(sup_c, "C")
                     .select(
                         F.col("ab").alias("antecedent"),
                         F.array("C").alias("consequent"),
                         "support_abc",
                         (F.col("support_abc")/F.col("support_ab")).alias("confidence"),
                         ((F.col("support_abc")/F.col("support_ab"))/F.col("sup_c")).alias("lift")
                     )
                     .where((F.col("support_abc")>=F.lit(MIN_SUPPORT)) & (F.col("confidence")>=F.lit(MIN_CONF)))
        )
    else:
        rules_2to1 = spark.createDataFrame([], schema="antecedent array<string>, consequent array<string>, support_abc double, confidence double, lift double")

    # --- 2.5 Consolidation (same layout as the FP-Growth rules) ---
    fp_rules = rules_ab.select(
        "antecedent", "consequent",
        F.col("support_ab").alias("support_ab"),
        "confidence", "lift"
    ).unionByName(
        rules_2to1.select(
            "antecedent", "consequent",
            F.col("support_abc").alias("support_ab"),
            "confidence", "lift"
        )
    )

    fp_rules_pretty = _prettify_rules(fp_rules)

# =========================
# Save results
# =========================
# The itemset columns differ between the two branches (freq vs cnt), so let the
# overwrite replace the table schema as well as the data.
(fp_itemsets.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("sc_gold.mba_fp_itemsets"))
fp_rules_pretty.write.format("delta").mode("overwrite").saveAsTable("sc_gold.mba_fp_rules")

print("Gravado:")
print("- sc_gold.mba_fp_itemsets")
print("- sc_gold.mba_fp_rules")

display(spark.table("sc_gold.mba_fp_rules").orderBy(F.desc("lift"), F.desc("confidence")).limit(30))


#Regras A→B (confidence, lift)

In [0]:
from pyspark.sql import functions as F

RULES_1TO1 = "sc_gold.mba_modelos_rules"       # existing table of A→B rules
TX_GROUPED = "sc_gold.deals_tx_grouped"        # transaction_id, models_list (rebuilt by the 2→1 rules cell when the table is missing)
TOP_N = 20                                     # number of top items used in the charts
MIN_SUPPORT_ABC = 0.005                        # 0.5% minimum support for 2→1 rules
MIN_CONF_2TO1 = 0.20                           # 20% minimum confidence for 2→1 rules


In [0]:
# Co-occurrence heatmap (lift) for the A→B rules
# Read the 1→1 rules
rules = spark.table(RULES_1TO1).select("antecedent", "consequent", "support_ab", "confidence", "lift")

# Top models = the TOP_N models that appear most often in the rules, on either
# side. (support_a / support_b could be used instead if they are selected above.)
model_mentions = rules.select(F.col("antecedent").alias("m")).union(
    rules.select(F.col("consequent").alias("m"))
)
top_models_pdf = (
    model_mentions.groupBy("m").count()
    .orderBy(F.desc("count"))
    .limit(TOP_N)
    .select("m")
    .toPandas()
)
top_models = top_models_pdf["m"].tolist()

# Keep only the rules whose two sides are both top models
both_sides_top = F.col("antecedent").isin(top_models) & F.col("consequent").isin(top_models)
rules_top = rules.where(both_sides_top).select("antecedent", "consequent", "lift")

# Pivot into an antecedent x consequent matrix (0 where no rule exists)
import pandas as pd, numpy as np
pdf = rules_top.toPandas()
if pdf.empty:
    raise ValueError("Sem regras suficientes para o heatmap. Ajusta TOP_N.")
mat = pdf.pivot_table(index="antecedent", columns="consequent", values="lift", fill_value=0)

# Heatmap drawn with matplotlib only
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(12, 8))
image = ax.imshow(mat.values)
ax.set_xticks(range(len(mat.columns)))
ax.set_xticklabels(mat.columns, rotation=90)
ax.set_yticks(range(len(mat.index)))
ax.set_yticklabels(mat.index)
ax.set_title(f"Lift (A→B) — Top {len(top_models)} modelos")
fig.colorbar(image, ax=ax, label="Lift")
fig.tight_layout()
plt.show()


In [0]:
# Simple co-purchase network without networkx (line width follows lift)
import math

# Take the E strongest edges (by lift) among the top models
E = 60  # maximum number of edges to draw
edges = rules_top.orderBy(F.desc("lift")).limit(E).toPandas()

# Place the nodes evenly on the unit circle, in alphabetical order
nodes = sorted(set(edges["antecedent"]) | set(edges["consequent"]))
coords = {}
for position, node in enumerate(nodes):
    coords[node] = (
        math.cos(2*math.pi*position/len(nodes)),
        math.sin(2*math.pi*position/len(nodes)),
    )

fig, ax = plt.subplots(figsize=(10, 10))

# Edges: one segment per rule, width equal to lift clamped to [0.5, 6.0]
for source, target, lift in zip(edges["antecedent"], edges["consequent"], edges["lift"]):
    x1, y1 = coords[source]
    x2, y2 = coords[target]
    lw = max(0.5, min(6.0, lift))
    ax.plot([x1, x2], [y1, y2], linewidth=lw, alpha=0.6)

# Nodes and their labels
for node, (x, y) in coords.items():
    ax.scatter([x], [y], s=150)
    ax.text(x, y, node, ha="center", va="center", fontsize=8)

ax.axis("off")
ax.set_title("Rede de co-ocorrência (espessura ~ lift)")
plt.show()


In [0]:
# "Recommend" based on the A→B rules (and also the 2→1 rules, if available)
from typing import List

# Load the 1→1 rules and bring them into pandas; recommend_from_items reads rules_pdf.
rules_df = spark.table(RULES_1TO1).select("antecedent","consequent","support_ab","confidence","lift")
rules_pdf = rules_df.toPandas()

def recommend_from_items(items: List[str], top_k=10, prefer="lift", rules=None):
    """Recommend consequents of the A→B rules triggered by `items`.

    Parameters
    ----------
    items : models already in the basket; a rule fires when its antecedent is
        one of them.
    top_k : maximum number of recommendations returned.
    prefer : column used to rank the recommendations, highest first
        ("lift", "confidence" or "support_ab").
    rules : optional pandas DataFrame with columns antecedent, consequent,
        support_ab, confidence and lift. Defaults to the notebook-level
        `rules_pdf`.

    Returns
    -------
    pandas DataFrame with one row per recommended consequent and the maximum
    support_ab / confidence / lift over the rules that produced it. Models
    already present in `items` are never recommended.
    """
    rule_table = rules_pdf if rules is None else rules
    owned = list(set(items))
    # A→B fires when A is owned; skip B when it is already owned too
    fires = rule_table["antecedent"].isin(owned)
    already_owned = rule_table["consequent"].isin(owned)
    cand = rule_table[fires & ~already_owned].copy()
    # Several antecedents may point to the same consequent: keep the best value of each metric
    agg = cand.groupby("consequent")[["support_ab","confidence","lift"]].agg("max").reset_index()
    return agg.sort_values(prefer, ascending=False).head(top_k)

# Example:
display(recommend_from_items(["Ioniq 5"]).head(10))
# If the 2→1 rules table is available, a function taking two antecedents could be written:
# recommend_pair(["Ioniq 5","Ioniq 6"])


In [0]:
# Rules 2→1 (A,B→C) and their heatmap (computes and saves)
# 4.1 make sure tx_grouped exists; otherwise build it from the raw deals
if spark.catalog.tableExists(TX_GROUPED):
    tx_grouped = spark.table(TX_GROUPED)
else:
    # Fallback: build from sc_gold.deals_2
    # NOTE(review): an earlier cell of this notebook drops data_venda from
    # sc_gold.deals_2 — confirm the column exists when this branch runs.
    base = (spark.table("sc_gold.deals_2")
            .select("conta_name","data_venda","modelos")
            .where(F.col("modelos").isNotNull()))
    # A transaction is one customer on one sale date
    deals_tx = (base.select(
                    "conta_name","data_venda","modelos",
                    F.concat_ws("_", "conta_name", F.date_format("data_venda","yyyyMMdd")).alias("transaction_id")
                ).dropDuplicates(["transaction_id","modelos"]))
    tx_grouped = (deals_tx.groupBy("transaction_id")
                        .agg(F.collect_set("modelos").alias("models_list"))
                        .where(F.size("models_list") > 1))

N = tx_grouped.count()

# 4.2 supports of single items and of pairs
# One row per (transaction, distinct model). Combinations below are ordered by
# item VALUE, never by position in models_list: the array order carries no
# meaning (collect_set does not define one), so positional ordering would split
# the same item set across several groups.
tx_items = tx_grouped.select(
    "transaction_id",
    F.explode(F.array_distinct("models_list")).alias("item"),
)

one_items = (tx_items.groupBy("item").agg(F.count("*").alias("cnt"))
                     .withColumn("support", F.col("cnt")/F.lit(N)))

a = tx_items.select("transaction_id", F.col("item").alias("A"))
b = tx_items.select("transaction_id", F.col("item").alias("B"))
c = tx_items.select("transaction_id", F.col("item").alias("C"))

# Every unordered pair of a transaction exactly once (A < B), with a sorted key
tx_pairs = (a.join(b, "transaction_id")
             .where(F.col("A") < F.col("B"))
             .withColumn("ab", F.array_sort(F.array("A","B"))))

pairs = (tx_pairs.groupBy("A","B","ab").agg(F.count("*").alias("cnt"))
                 .withColumn("support_ab", F.col("cnt")/F.lit(N))
                 .select("A","B","cnt","support_ab","ab"))

# 4.3 triples: each pair {A,B} of a transaction combined with every other item
# C of the same transaction, so a basket {X,Y,Z} yields the three candidate
# rules (X,Y)→Z, (X,Z)→Y and (Y,Z)→X.
triples = (tx_pairs.join(c, "transaction_id")
             .where((F.col("C") != F.col("A")) & (F.col("C") != F.col("B")))
             .select("ab", F.col("C")))

# cnt = number of transactions containing A, B and C together
support_abc = (triples.groupBy("ab","C").agg(F.count("*").alias("cnt"))
                      .withColumn("support_abc", F.col("cnt")/F.lit(N)))

# 4.4 join and compute confidence / lift of (A,B)→C
sup_c = one_items.select(F.col("item").alias("C"), F.col("support").alias("sup_c"))
rules_2to1 = (support_abc.join(pairs.select("ab","support_ab"), "ab", "inner")
                        .join(sup_c, "C", "inner")
                        .select(
                            "ab", "C",
                            "support_abc","support_ab","sup_c",
                            (F.col("support_abc")/F.col("support_ab")).alias("confidence"),
                            ((F.col("support_abc")/F.col("support_ab"))/F.col("sup_c")).alias("lift")
                        )
                        .where((F.col("support_abc")>=MIN_SUPPORT_ABC) & (F.col("confidence")>=MIN_CONF_2TO1))
                        .orderBy(F.desc("confidence"), F.desc("lift"), F.desc("support_abc"))
)

# Save. Another cell of this notebook writes the same table with extra columns
# (sup_ab, sup_c); overwriteSchema lets this overwrite replace that layout.
rules_2to1_out = (rules_2to1
                  .select(F.col("ab").alias("antecedent_AB"),
                          F.col("C").alias("consequent"),
                          "support_abc","confidence","lift"))
(rules_2to1_out.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("sc_gold.mba_modelos_rules_2to1"))

display(rules_2to1_out.limit(50))

# 4.5 Heatmap 2→1 (lift)
pdf2 = rules_2to1_out.toPandas()
if not pdf2.empty:
    # Readable label for the antecedent pair, e.g. "X + Y"
    pdf2["AB"] = pdf2["antecedent_AB"].apply(lambda xs: " + ".join(xs))
    # Keep the 15 antecedent pairs / consequents with the highest maximum lift
    top_ab = (pdf2.groupby("AB")["lift"].max().sort_values(ascending=False).head(15)).index.tolist()
    top_c  = (pdf2.groupby("consequent")["lift"].max().sort_values(ascending=False).head(15)).index.tolist()
    # Cells with no rule are shown as 0
    mat2 = pdf2[pdf2["AB"].isin(top_ab) & pdf2["consequent"].isin(top_c)] \
              .pivot_table(index="AB", columns="consequent", values="lift", fill_value=0)

    import matplotlib.pyplot as plt
    plt.figure(figsize=(14,8))
    plt.imshow(mat2.values)
    plt.xticks(range(len(mat2.columns)), mat2.columns, rotation=90)
    plt.yticks(range(len(mat2.index)), mat2.index)
    plt.title("Lift — Regras 2→1 (A,B→C)")
    plt.colorbar(label="Lift")
    plt.tight_layout()
    plt.show()


In [0]:
# "Story" — quick automatic reading of the results
rules = spark.table(RULES_1TO1).select("antecedent", "consequent", "support_ab", "confidence", "lift")
strongest_first = [F.desc("lift"), F.desc("confidence"), F.desc("support_ab")]
top = rules.orderBy(*strongest_first).limit(10).toPandas()

print("TOP 10 REGRAS (ordenado por lift):")
for row in top.itertuples(index=False):
    print(f"- {row.antecedent} → {row.consequent} | lift={row.lift:.2f}, conf={float(row.confidence):.2f}, supAB={float(row.support_ab):.3%}")

# A few programmatic insights: rules among the top 10 with lift of at least 3
strong = top.loc[top["lift"] >= 3]
if not strong.empty:
    print("\nRegras MUITO fortes (lift ≥ 3):")
    for row in strong.itertuples(index=False):
        print(f"  * {row.antecedent} → {row.consequent} (lift {row.lift:.2f})")

# Count the rules that mention "ioniq" or "hev" (case-insensitive) on either side
ev_condition = None
for like_pattern in ("%ioniq%", "%hev%"):
    for side in ("antecedent", "consequent"):
        clause = F.lower(side).like(like_pattern)
        ev_condition = clause if ev_condition is None else (ev_condition | clause)
ev_pairs = rules.where(ev_condition).count()
print(f"\nRegras envolvendo EV/HEV detetadas: {ev_pairs}")


B->A

In [0]:
# Parameters
from pyspark.sql import functions as F, Window as W

RULES_1TO1 = "sc_gold.mba_modelos_rules"     # columns: antecedent, consequent, support_ab, support_a, support_b, confidence, lift
DEALS      = "sc_gold.deals_2"               # source for the per-customer recommendations
TOP_K_PER_TRIGGER = 5                         # number of recommendations per purchased model


In [0]:
# Build the inverted (B→A) rules and save them
rule_columns = ["antecedent", "consequent", "support_ab", "support_a", "support_b", "confidence", "lift"]
rules_ab = spark.table(RULES_1TO1).select(*rule_columns)

# Confidence of the inverted rule: joint support over the support of the new trigger
confidence_ba = F.col("support_ab") / F.col("support_b")
has_positive_support = (F.col("support_b") > 0) & (F.col("support_ab") > 0)
strongest_first = [F.desc("lift_ba"), F.desc("confidence_ba"), F.desc("support_ab")]

inverted = rules_ab.select(
    F.col("consequent").alias("trigger_b"),     # what the customer bought
    F.col("antecedent").alias("recommend_a"),   # what gets recommended
    F.col("support_ab"),
    F.col("support_b"),
    F.col("support_a"),
    confidence_ba.alias("confidence_ba"),
    F.col("lift").alias("lift_ba"),             # lift is carried over unchanged
)
rules_ba = inverted.where(has_positive_support).orderBy(*strongest_first)

# Drop first, then write the table from scratch
spark.sql("DROP TABLE IF EXISTS sc_gold.mba_modelos_rules_BA")

rules_ba.write.format("delta").mode("overwrite").saveAsTable("sc_gold.mba_modelos_rules_BA")

In [0]:

# Ranking per trigger (Top-K recommendations for each purchased model)

# Rank inside each trigger by lift, then confidence, then joint support
rank_order = [F.desc("lift_ba"), F.desc("confidence_ba"), F.desc("support_ab")]
w = W.partitionBy("trigger_b").orderBy(*rank_order)

ranked = rules_ba.withColumn("rank", F.row_number().over(w))
reco_by_trigger = ranked.filter(F.col("rank") <= TOP_K_PER_TRIGGER)

reco_by_trigger.write.format("delta").mode("overwrite").saveAsTable("sc_gold.reco_modelo_by_trigger")

saved_reco = spark.table("sc_gold.reco_modelo_by_trigger")
display(saved_reco.orderBy("trigger_b", "rank").limit(50))


In [0]:
# Next best model per customer (based on the customer's latest purchase)
from pyspark.sql import functions as F, Window as W

# NOTE(review): an earlier cell of this notebook drops data_venda from
# sc_gold.deals_2 — confirm DEALS still exposes the column when this cell runs.
df = spark.table(DEALS).select("conta_name","data_venda","modelos").where(
    F.col("conta_name").isNotNull() & F.col("data_venda").isNotNull() & F.col("modelos").isNotNull()
)

# Latest purchase per customer. A customer can have several models on the same
# date, so ordering by date alone would let row_number() pick one of them
# arbitrarily (and differently between runs); the model name is a deterministic
# tie-breaker.
w_last = W.partitionBy("conta_name").orderBy(F.desc("data_venda"), F.asc("modelos"))
last_model = (
    df.withColumn("rn", F.row_number().over(w_last))
      .where(F.col("rn")==1)
      .select(F.col("conta_name"), F.col("modelos").alias("ultimo_modelo"))
)

reco = spark.table("sc_gold.reco_modelo_by_trigger")

# Left join: customers whose last model triggers no rule are kept, with null
# recommendation columns.
reco_cliente = (
    last_model.join(reco, last_model.ultimo_modelo == reco.trigger_b, "left")
              .select(
                  "conta_name",
                  "ultimo_modelo",
                  F.col("recommend_a").alias("next_best_model"),
                  "rank",
                  "confidence_ba",
                  "lift_ba"
              )
              .orderBy("conta_name","rank")
)

reco_cliente.write.format("delta").mode("overwrite").saveAsTable("sc_gold.reco_modelo_cliente_topK")
display(spark.table("sc_gold.reco_modelo_cliente_topK").limit(50))


In [0]:
# Heatmap B→A (lift) with matplotlib (no seaborn)
import pandas as pd
import matplotlib.pyplot as plt

# Pick the top triggers and top recommendations by their best lift
rb = spark.table("sc_gold.mba_modelos_rules_BA")

TOP_TRIGGERS = 15
TOP_RECS     = 15


def _top_by_max_lift(column, limit):
    """Return the `limit` values of `column` in `rb` with the highest maximum lift_ba."""
    best = (
        rb.groupBy(column)
          .agg(F.max("lift_ba").alias("mx"))
          .orderBy(F.desc("mx"))
          .limit(limit)
          .select(column)
          .toPandas()
    )
    return best[column].tolist()


top_triggers = _top_by_max_lift("trigger_b", TOP_TRIGGERS)
top_recs = _top_by_max_lift("recommend_a", TOP_RECS)

in_both_tops = F.col("trigger_b").isin(top_triggers) & F.col("recommend_a").isin(top_recs)
rb_top = rb.where(in_both_tops).select("trigger_b", "recommend_a", "lift_ba")

pdf = rb_top.toPandas()
if pdf.empty:
    raise ValueError("Sem dados suficientes para o heatmap. Aumenta TOP_TRIGGERS/TOP_RECS.")

# trigger x recommendation matrix of lift (0 where no rule exists)
pivot = pdf.pivot_table(index="trigger_b", columns="recommend_a", values="lift_ba", fill_value=0)

fig, ax = plt.subplots(figsize=(14, 8))
image = ax.imshow(pivot.values)
ax.set_xticks(range(len(pivot.columns)))
ax.set_xticklabels(pivot.columns, rotation=90)
ax.set_yticks(range(len(pivot.index)))
ax.set_yticklabels(pivot.index)
ax.set_title("Heatmap B→A (Lift) — Trigger: modelo comprado (B)  |  Recomendação: A")
fig.colorbar(image, ax=ax, label="Lift")
fig.tight_layout()
plt.show()


In [0]:
# Table for Marketing (clean, one row per trigger & rank)
reco_source = spark.table("sc_gold.reco_modelo_by_trigger")

# Metric columns rounded to 4 decimals: source column -> published name
rounded_metrics = {
    "confidence_ba": "confidence",
    "lift_ba": "lift",
    "support_ab": "support_ab",
    "support_b": "support_b",
}

marketing = reco_source.select(
    F.col("trigger_b").alias("modelo_comprado"),
    F.col("rank").alias("rank_reco"),
    F.col("recommend_a").alias("modelo_recomendado"),
    *[F.round(source_col, 4).alias(published) for source_col, published in rounded_metrics.items()],
).orderBy("modelo_comprado", "rank_reco")

marketing.write.format("delta").mode("overwrite").saveAsTable("sc_gold.marketing_next_best_model")

display(spark.table("sc_gold.marketing_next_best_model").limit(50))
