**Viaturas table**

In [0]:
%sql
-- Preview all rows and columns of the source vehicles table.
SELECT * FROM sc_gold.viaturas

In [0]:
%sql

-- Drop any previous working copy before recreating it.
DROP TABLE IF EXISTS sc_gold.viaturas_2;

-- Working copy of sc_gold.viaturas restricted to the columns listed below.
CREATE TABLE sc_gold.viaturas_2 AS
SELECT id,modelo,motorizacao,data_de_matricula,cilindrada__cm3_,potencia_maxima__kw_,combustivel,production_date,gwms_engine
FROM sc_gold.viaturas;

In [0]:
%sql
-- Preview the working copy of the vehicles table.
SELECT * FROM sc_gold.viaturas_2

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window as W
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType, DecimalType
from pyspark.sql.functions import split

In [0]:
# Null profile of the working vehicles table: percentage of NULLs per column.
table_name = "sc_gold.viaturas_2"
df = spark.table(table_name)

# The row count is the denominator of every percentage below.
total_rows = df.count()

# One aggregate per column: count the NULL hits, then scale to a percentage.
null_pct_exprs = []
for col_name in df.columns:
    null_hits = F.count(F.when(F.col(col_name).isNull(), col_name))
    null_pct_exprs.append((null_hits / total_rows * 100).alias(col_name))

null_percentages = df.select(null_pct_exprs)

display(null_percentages)

In [0]:
# Drop rows whose `modelo` is missing: NULL, blank after trimming, or the literal
# text "null" in any letter case. The table is then overwritten in place.
table_name = "sc_gold.viaturas_2"
df0 = spark.table(table_name)

modelo_trimmed = F.trim(F.col("modelo"))
modelo_is_usable = (
    F.col("modelo").isNotNull()
    & (modelo_trimmed != "")
    & (F.lower(modelo_trimmed) != "null")
)
df = df0.where(modelo_is_usable)

# Write the filtered rows back over the same table.
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable(table_name)
)

In [0]:
# Find the average number of days between the production date and the
# registration date, using only rows where both dates can be parsed.

# Date layouts tried in order. 'yyyy/MM/dd' is included so that this estimate parses
# the same values as the imputation step that consumes its result.
date_formats = ["dd-MM-yyyy", "yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd"]


def _parse_date(column_name, formats=tuple(date_formats)):
    """Return a DATE column parsed from `column_name`; NULL when no layout matches."""
    return F.coalesce(*[
        F.expr(f"try_to_date({column_name}, '{fmt}')") for fmt in formats
    ])


# Load the table
df = spark.table("sc_gold.viaturas_2")

# Parse both date columns into helper DATE columns
df = (
    df.withColumn("data_de_matricula_dt", _parse_date("data_de_matricula"))
      .withColumn("production_date_dt", _parse_date("production_date"))
)

# Keep only rows where both dates are known
df_valid = df.filter(
    F.col("data_de_matricula_dt").isNotNull() & F.col("production_date_dt").isNotNull()
)

# Difference in days (registration minus production)
df_valid = df_valid.withColumn(
    "diff_days",
    F.datediff(F.col("data_de_matricula_dt"), F.col("production_date_dt")),
)

# Sum, row count and mean; a global aggregate always yields exactly one row
agg = df_valid.agg(
    F.sum("diff_days").alias("soma_dias"),
    F.count("diff_days").alias("n_linhas"),
    F.avg("diff_days").alias("media_dias"),
).collect()[0]

# avg() is NULL when no row has both dates; round(None) would raise TypeError
media_dias = agg["media_dias"]

print("üîπ Soma total dos dias:", agg["soma_dias"])
print("üîπ N√∫mero de linhas usadas:", agg["n_linhas"])
print("üîπ M√©dia de dias:", round(media_dias, 2) if media_dias is not None else None)

In [0]:
# Convert both date columns to DATE and fill a missing one from the other using the
# 131-day offset derived in the previous calculation, then overwrite the table.

df = spark.table("sc_gold.viaturas_2")

gap_days = 131
date_layouts = ["dd-MM-yyyy", "yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd"]

# Try each layout in turn; the first successful parse wins, otherwise NULL.
for date_col in ["data_de_matricula", "production_date"]:
    parse_attempts = [
        F.expr(f"try_to_date({date_col}, '{layout}')") for layout in date_layouts
    ]
    df = df.withColumn(date_col, F.coalesce(*parse_attempts))

matricula = F.col("data_de_matricula")
producao = F.col("production_date")

# Missing production date: registration date minus the offset.
df = df.withColumn(
    "production_date",
    F.when(producao.isNull() & matricula.isNotNull(), F.date_sub(matricula, gap_days))
     .otherwise(producao),
)

# Missing registration date: production date plus the offset. This runs after the
# step above, so it sees the already-filled production_date.
df = df.withColumn(
    "data_de_matricula",
    F.when(matricula.isNull() & producao.isNotNull(), F.date_add(producao, gap_days))
     .otherwise(matricula),
)

# Rewrite the table, allowing the schema to change.
writer = df.write.format("delta").mode("overwrite").option("overwriteSchema", "true")
writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Add `production_date_dt`, `production_year` and `age_year` (whole years between the
# production date and today), then overwrite the table with the new columns.
df = spark.table("sc_gold.viaturas_2")

# Parse production_date defensively in case it is still stored as text.
production_layouts = ("dd-MM-yyyy", "yyyy-MM-dd", "dd/MM/yyyy")
production_parsed = F.coalesce(*[
    F.expr(f"try_to_date(production_date, '{layout}')") for layout in production_layouts
])

days_since_production = F.datediff(F.current_date(), F.col("production_date_dt"))

df = (
    df.withColumn("production_date_dt", production_parsed)
      # Calendar year of production.
      .withColumn("production_year", F.year("production_date_dt"))
      # Age in years up to today, rounded down.
      .withColumn("age_year", F.floor(days_since_production / 365.25))
)

# overwriteSchema lets the table accept the newly added columns.
writer = df.write.format("delta").mode("overwrite").option("overwriteSchema", "true")
writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Fill missing engine displacement (cilindrada__cm3_) with the rounded average of the
# (gwms_engine, motorizacao, modelo) group, falling back to the global average.
# 1) Read the table
df = spark.table("sc_gold.viaturas_2")

# 2) Window over the grouping attributes
w = W.partitionBy("gwms_engine", "motorizacao", "modelo")

# 3) Global average (fallback), already rounded to an integer
global_avg = df.select(F.avg("cilindrada__cm3_").alias("g")).first()["g"]
if global_avg is not None:
    global_avg = int(round(global_avg))

# 4) Decide what is "missing" AFTER the integer cast. Testing the raw column for NULL
#    let values that are non-NULL but fail the cast turn into NULL without ever
#    being imputed.
cilindrada_int = F.col("cilindrada__cm3_").cast(IntegerType())

df_filled = (
    df
    .withColumn("avg_grupo", F.avg("cilindrada__cm3_").over(w))
    .withColumn(
        "cilindrada__cm3_",
        # First non-NULL of: own value, rounded group average, global average.
        F.coalesce(
            cilindrada_int,
            F.round(F.col("avg_grupo")).cast(IntegerType()),
            F.lit(global_avg).cast(IntegerType()),
        ),
    )
    .drop("avg_grupo")
)

# 5) Write the filled result back
(df_filled.write
   .mode("overwrite")
   .option("overwriteSchema", "true")
   .saveAsTable("sc_gold.viaturas_2"))

In [0]:
# Fill missing power (potencia_maxima__kw_) with the average of the
# (gwms_engine, motorizacao, modelo) group, falling back to the global average.
# 1) Read the table
df = spark.table("sc_gold.viaturas_2")

# 2) Clean and cast:
#    - replace the decimal comma with a dot
#    - strip every character that is not a digit or a dot (e.g. ' kW', spaces)
pot_clean = F.regexp_replace(F.col("potencia_maxima__kw_"), ",", ".")
pot_clean = F.regexp_replace(pot_clean, r"[^0-9.]", "")
df = df.withColumn("potencia_maxima__kw_", pot_clean.cast(DoubleType()))

# 3) Global average (fallback), rounded to 1 decimal place
global_avg = df.select(F.avg("potencia_maxima__kw_").alias("g")).first()["g"]
global_avg_1d = round(global_avg, 1) if global_avg is not None else None

# 4) Group average through a window instead of groupBy + join.
#    An equi-join never matches rows whose key contains NULL, so those rows skipped
#    the group average entirely. A window partition keeps NULL keys together, as the
#    displacement imputation in this notebook already does.
keys = ["gwms_engine", "motorizacao", "modelo"]
w_potencia = W.partitionBy(*keys)

# The join version emitted the key columns first; keep that column order.
ordered_cols = keys + [c for c in df.columns if c not in keys]

# 5) Fill only NULLs with the group average (1 decimal place);
#    when the whole group is NULL, use the global average
df_filled = (
    df.withColumn("avg_grp", F.avg("potencia_maxima__kw_").over(w_potencia))
      .withColumn(
          "potencia_maxima__kw_",
          F.coalesce(
              F.col("potencia_maxima__kw_"),
              F.round(F.col("avg_grp"), 1),
              F.lit(global_avg_1d),
          ),
      )
      .select(*ordered_cols)
)

# (Optional) To normalise the whole column to 1 decimal place, non-NULLs included:
# df_filled = df_filled.withColumn("potencia_maxima__kw_", F.round(F.col("potencia_maxima__kw_"), 1))

# (Optional) To pin the schema type to Decimal(10,1) instead of double:
# df_filled = df_filled.withColumn("potencia_maxima__kw_", F.col("potencia_maxima__kw_").cast(DecimalType(10,1)))

# 6) Write the result
(df_filled.write
   .mode("overwrite")
   .option("overwriteSchema", "true")
   .saveAsTable("sc_gold.viaturas_2"))

In [0]:
# Fill 'motorizacao' with the MODE of its group
# (gwms_engine, modelo, potencia_maxima__kw_, combustivel), falling back to the global mode.
# 1) Read the table
df = spark.table("sc_gold.viaturas_2")

# 2) Power: comma -> dot, strip noise and cast to double (it is part of the group key)
pot = F.regexp_replace(F.col("potencia_maxima__kw_"), ",", ".")
pot = F.regexp_replace(pot, r"[^0-9.]", "")
df = df.withColumn("potencia_maxima__kw_", pot.cast(DoubleType()))

# 3) Clean the relevant text columns: trim, then turn empty strings into NULL
for text_col in ["motorizacao", "combustivel", "gwms_engine", "modelo"]:
    df = df.withColumn(text_col, F.trim(F.col(text_col)))
for text_col in ["motorizacao", "combustivel", "gwms_engine"]:
    df = df.withColumn(
        text_col,
        F.when(F.col(text_col) == "", None).otherwise(F.col(text_col)),
    )

# 4) Group keys
keys_mot = ["gwms_engine", "modelo", "potencia_maxima__kw_", "combustivel"]

# 5) Mode of 'motorizacao' per group (ties broken alphabetically)
counts_mot = (
    df.filter(F.col("motorizacao").isNotNull())
      .groupBy(*keys_mot, "motorizacao")
      .agg(F.count(F.lit(1)).alias("cnt"))
)
w_mot = W.partitionBy(*keys_mot).orderBy(F.col("cnt").desc(), F.col("motorizacao").asc())
mode_motorizacao = (
    counts_mot.withColumn("rn", F.row_number().over(w_mot))
              .filter(F.col("rn") == 1)
              .select(*keys_mot, F.col("motorizacao").alias("mode_motorizacao"))
)

# 6) Global mode of 'motorizacao' (fallback)
row_mot = (
    df.filter(F.col("motorizacao").isNotNull())
      .groupBy("motorizacao").agg(F.count(F.lit(1)).alias("cnt"))
      .orderBy(F.col("cnt").desc(), F.col("motorizacao").asc())
      .limit(1).first()
)
global_mode_mot = row_mot["motorizacao"] if row_mot else None

# 7) Fill ONLY the NULLs of 'motorizacao' with the group mode (or the global one).
#    Step 3 sets gwms_engine/combustivel to NULL, and a plain equi-join on those keys
#    never matches NULL = NULL, so such rows always fell through to the global mode.
#    Join with a null-safe comparison instead; the lookup keys are renamed so that
#    each name resolves to exactly one side of the join.
mode_lookup = mode_motorizacao.select(
    *[F.col(k).alias(f"_key_{k}") for k in keys_mot],
    "mode_motorizacao",
)
null_safe_match = [F.col(k).eqNullSafe(F.col(f"_key_{k}")) for k in keys_mot]

# The name-based join emitted the key columns first; keep that column order.
ordered_cols = keys_mot + [c for c in df.columns if c not in keys_mot]

df_filled = (
    df.join(mode_lookup, on=null_safe_match, how="left")
      .withColumn(
          "motorizacao",
          F.coalesce(
              F.col("motorizacao"),
              F.col("mode_motorizacao"),
              F.lit(global_mode_mot),
          ),
      )
      # Also drops the helper `_key_*` and `mode_motorizacao` columns.
      .select(*ordered_cols)
)

# 8) Write the result
(df_filled.write
   .mode("overwrite")
   .option("overwriteSchema", "true")
   .saveAsTable("sc_gold.viaturas_2"))


In [0]:
# Fill 'gwms_engine' with the MODE of its group
# (cilindrada__cm3_, potencia_maxima__kw_, modelo, motorizacao), falling back to the global mode.
# 1) Read the table
df = spark.table("sc_gold.viaturas_2")

# 2) Clean the numeric columns (comma -> dot, strip noise) and cast to double
for num_col in ("cilindrada__cm3_", "potencia_maxima__kw_"):
    dotted = F.regexp_replace(F.col(num_col), ",", ".")
    digits_only = F.regexp_replace(dotted, r"[^0-9.]", "")
    df = df.withColumn(num_col, digits_only.cast(DoubleType()))

# 3) Clean the text columns: trim, and empty strings become NULL
engine_trimmed = F.trim(F.col("gwms_engine"))
df = df.withColumn(
    "gwms_engine",
    F.when(engine_trimmed == "", None).otherwise(engine_trimmed),
)
df = df.withColumn("modelo", F.trim(F.col("modelo")))
motorizacao_trimmed = F.trim(F.col("motorizacao"))
df = df.withColumn(
    "motorizacao",
    F.when(motorizacao_trimmed == "", None).otherwise(motorizacao_trimmed),
)

# 4) Group keys
keys = ["cilindrada__cm3_", "potencia_maxima__kw_", "modelo", "motorizacao"]

# 5) MODE of gwms_engine per group: count each value, keep the most frequent one
#    (ties broken alphabetically)
known_engine = df.where(F.col("gwms_engine").isNotNull())
counts = known_engine.groupBy(*keys, "gwms_engine").agg(F.count(F.lit(1)).alias("cnt"))

w = W.partitionBy(*keys).orderBy(F.col("cnt").desc(), F.col("gwms_engine").asc())

ranked = counts.withColumn("rn", F.row_number().over(w))
mode_by_group = ranked.where(F.col("rn") == 1).select(
    *keys, F.col("gwms_engine").alias("mode_gwms_engine")
)

# 6) GLOBAL mode, used when the group has no valid occurrences
row_global = (
    known_engine.groupBy("gwms_engine")
                .agg(F.count(F.lit(1)).alias("cnt"))
                .orderBy(F.col("cnt").desc(), F.col("gwms_engine").asc())
                .limit(1)
                .first()
)
global_mode = row_global["gwms_engine"] if row_global else None

# 7) Fill ONLY the NULLs: own value first, then group mode, then global mode
joined = df.join(mode_by_group, on=keys, how="left")
df_filled = joined.withColumn(
    "gwms_engine",
    F.coalesce(F.col("gwms_engine"), F.col("mode_gwms_engine"), F.lit(global_mode)),
).drop("mode_gwms_engine")

# 8) Write the result
writer = df_filled.write.mode("overwrite").option("overwriteSchema", "true")
writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Fill 'combustivel' with the MODE of its group
# (gwms_engine, cilindrada__cm3_, modelo, motorizacao), falling back to the global mode.
# 1) Read the table
df = spark.table("sc_gold.viaturas_2")

# 2) Clean the numeric column (comma -> dot, strip noise) and cast to double
cil_dotted = F.regexp_replace(F.col("cilindrada__cm3_"), ",", ".")
cil_digits = F.regexp_replace(cil_dotted, r"[^0-9.]", "")
df = df.withColumn("cilindrada__cm3_", cil_digits.cast(DoubleType()))

# 3) Clean the text columns: trim everything; blank values become NULL for the
#    columns that may legitimately be missing
for nullable_col in ("gwms_engine", "motorizacao", "combustivel"):
    trimmed = F.trim(F.col(nullable_col))
    df = df.withColumn(
        nullable_col,
        F.when(F.length(trimmed) == 0, None).otherwise(trimmed.cast("string")),
    )
df = df.withColumn("modelo", F.trim(F.col("modelo")).cast("string"))

# 4) Group key
keys = ["gwms_engine", "cilindrada__cm3_", "modelo", "motorizacao"]

# 5) Mode of combustivel per group (ties broken alphabetically)
known_fuel = df.where(F.col("combustivel").isNotNull())
counts = known_fuel.groupBy(*keys, "combustivel").agg(F.count(F.lit(1)).alias("cnt"))

w = W.partitionBy(*keys).orderBy(F.col("cnt").desc(), F.col("combustivel").asc())

ranked = counts.withColumn("rn", F.row_number().over(w))
mode_by_group = ranked.where(F.col("rn") == 1).select(
    *keys, F.col("combustivel").alias("mode_combustivel")
)

# 6) Global mode as fallback
row_global = (
    known_fuel.groupBy("combustivel")
              .agg(F.count(F.lit(1)).alias("cnt"))
              .orderBy(F.col("cnt").desc(), F.col("combustivel").asc())
              .limit(1)
              .first()
)
global_mode = row_global["combustivel"] if row_global else None

# 7) Fill only NULLs: own value first, then group mode, then global mode
joined = df.join(mode_by_group, on=keys, how="left")
df_filled = joined.withColumn(
    "combustivel",
    F.coalesce(F.col("combustivel"), F.col("mode_combustivel"), F.lit(global_mode)),
).drop("mode_combustivel")

# 8) Write the result
writer = df_filled.write.mode("overwrite").option("overwriteSchema", "true")
writer.saveAsTable("sc_gold.viaturas_2")


In [0]:
# Remove rows that still have no usable date (to be confirmed).
# The filter is on `age_year`, which is NULL whenever the production date is unknown.
df = spark.table("sc_gold.viaturas_2")

has_age = F.col("age_year").isNotNull()
df_clean = df.where(has_age)

# Overwrite the table with the cleaned dataset.
writer = df_clean.write.mode("overwrite").option("overwriteSchema", "true")
writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Drop "production_year": it would distort the clustering by counting the same factor
# as "age_year" twice. The date columns ("data_de_matricula", "production_date",
# "production_date_dt") are dropped too, since dates are not good clustering inputs.
# "gwms_engine" is removed in the same step.
df = spark.table("sc_gold.viaturas_2")

columns_to_drop = [
    "production_year",
    "data_de_matricula",
    "production_date",
    "production_date_dt",
    "gwms_engine",
]
df_clean = df.drop(*columns_to_drop)

# Overwrite the table with the reduced dataset.
writer = df_clean.write.mode("overwrite").option("overwriteSchema", "true")
writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Cast age_year to integer and persist the change.
# Reload the table instead of reusing `df` from the previous cell: that variable is
# the DataFrame from BEFORE the column drop (the drop result was `df_clean`), so
# writing it would bring the dropped columns back or fail on the changed schema.
df = spark.table("sc_gold.viaturas_2")
df = df.withColumn("age_year", F.col("age_year").cast(IntegerType()))

# Save the table, overwriting the previous version
(df.write
   .mode("overwrite")
   .option("overwriteSchema", "true")
   .saveAsTable("sc_gold.viaturas_2"))

# Check the result on the updated DataFrame
df.select("age_year").distinct().show(20)
df.printSchema()

In [0]:
# Bucket the age into 5-year intervals: "0-4", "5-9", "10-14", "15-19" and "20+".

# Reload the table so this cell does not depend on a `df` left over from a previous
# cell, whose underlying table has since been overwritten.
df = spark.table("sc_gold.viaturas_2")

# Lower bound of the 5-year bucket that contains age_year.
bucket_start = F.floor(F.col("age_year") / 5) * 5

df = df.withColumn(
    "age_interval",
    F.when(F.col("age_year") >= 20, "20+")
     .otherwise(
         F.concat(
             bucket_start.cast("string"),
             F.lit("-"),
             (bucket_start + 4).cast("string"),
         )
     ),
)

(df.write
   .mode("overwrite")
   .option("overwriteSchema", "true")
   .saveAsTable("sc_gold.viaturas_2"))


In [0]:
#escolher primeira palavra da coluna modelo para diminuir a granularidade desta coluna
#  Ler a tabela
#df = spark.table("sc_gold.viaturas_2")

#df = df.withColumn("modelo", F.split(F.col("modelo"), " ").getItem(0))

#(df.write
#   .mode("overwrite")
 #  .option("overwriteSchema", "true")
 #  .saveAsTable("sc_gold.viaturas_2"))


In [0]:
# Exclude rows whose `modelo` is one of the known invalid values.
valores_a_remover = [
    "Outro", "Outras Marcas", "Modelo", "900", "Jobs", "Fdh",
    "Pdei49ggyipaetry", "Fs", "800mt", "Fct", "Gsc4d261f",
]

# Read the table
df = spark.table("sc_gold.viaturas_2")

# Keep only rows whose model is not in the exclusion list
is_invalid_model = F.col("modelo").isin(valores_a_remover)
df = df.where(~is_invalid_model)

# Persist the filtered table
writer = df.write.mode("overwrite").option("overwriteSchema", "true")
writer.saveAsTable("sc_gold.viaturas_2")


Para veículos a combustão (Gasolina, Diesel, Híbridos HEV/PHEV, GPL, etc.)
Classe	Intervalo (kW)	Equivalente em cv
Baixa	< 75 kW	< 100 cv
Média	75–110 kW	100–150 cv
Alta	> 110 kW	> 150 cv
Para veículos 100% elétricos (combustivel = 'Elétrico' ou equivalente)
Classe	Intervalo (kW)	Observação
Baixa	< 100 kW	Ex.: Ioniq Electric 28kWh (88 kW)
Média	100–150 kW	Ex.: Kona EV 64kWh (150 kW)
Alta	> 150 kW	Ex.: Ioniq 5, Tesla, EVs potentes

In [0]:
from pyspark.sql import functions as F

# Classify each vehicle's power into a class, with separate thresholds for electric
# vehicles and for all other fuels.
df = spark.table("sc_gold.viaturas_2")

potencia = F.col("potencia_maxima__kw_")
is_electric = F.col("combustivel").isin("El√©trico", "Eletrico", "EV", "BEV")

# Branches are evaluated top to bottom: the electric thresholds (100 / 150 kW) come
# first, then the generic thresholds (75 / 110 kW). Rows matching no branch, for
# example a NULL power, fall through to "Desconhecida".
classe_potencia = (
    F.when(is_electric & (potencia < 100), "Baixa")
     .when(is_electric & potencia.between(100, 150), "M√©dia")
     .when(is_electric & (potencia > 150), "Alta")
     .when(potencia < 75, "Baixa")
     .when(potencia.between(75, 110), "M√©dia")
     .when(potencia > 110, "Alta")
     .otherwise("Desconhecida")
)
df = df.withColumn("classe_potencia", classe_potencia)

df.select("potencia_maxima__kw_", "combustivel", "classe_potencia").show(20)

# Persist the table with the new column
writer = df.write.mode("overwrite").option("overwriteSchema", "true")
writer.saveAsTable("sc_gold.viaturas_2")


In [0]:
from pyspark.sql import functions as F

# Null profile of the vehicles table after cleaning: percentage of NULLs per column.
table_name = "sc_gold.viaturas_2"
df = spark.table(table_name)

# The row count is the denominator of every percentage.
total_rows = df.count()


def _null_pct(column_name):
    """Percentage of NULL values in `column_name`, aliased back to the column name."""
    null_hits = F.count(F.when(F.col(column_name).isNull(), column_name))
    return (null_hits / total_rows * 100).alias(column_name)


null_percentages = df.select(list(map(_null_pct, df.columns)))

display(null_percentages)

**Histórico de serviços table**

In [0]:
%sql

-- Preview all rows and columns of the source service-history table.
SELECT * FROM workspace.sc_gold.historico_de_servicos

In [0]:
%sql
-- Drop any previous working copy before recreating it.
DROP TABLE IF EXISTS sc_gold.historico_de_servicos_2;
-- Working copy of the service history restricted to the three columns listed below.
CREATE TABLE sc_gold.historico_de_servicos_2 AS
SELECT viatura,descricao_servico_pos_venda,tipo_de_servico
FROM sc_gold.historico_de_servicos;

In [0]:
%sql

-- Preview the working copy of the service-history table.
SELECT * FROM workspace.sc_gold.historico_de_servicos_2

In [0]:
%sql
-- Replace the table with itself, keeping only rows whose tipo_de_servico is not NULL.
CREATE OR REPLACE TABLE workspace.sc_gold.historico_de_servicos_2 AS
SELECT *
FROM workspace.sc_gold.historico_de_servicos_2
WHERE tipo_de_servico IS NOT NULL

In [0]:
# Drop rows whose `tipo_de_servico` is missing: NULL, blank after trimming, or the
# literal text "null" in any letter case. The table is then overwritten in place.
table_name = "sc_gold.historico_de_servicos_2"
df0 = spark.table(table_name)

servico_trimmed = F.trim(F.col("tipo_de_servico"))
servico_is_usable = (
    F.col("tipo_de_servico").isNotNull()
    & (servico_trimmed != "")
    & (F.lower(servico_trimmed) != "null")
)
df = df0.where(servico_is_usable)

# Write the filtered rows back over the same table.
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable(table_name)
)

In [0]:
# Drop rows whose `viatura` is missing: NULL, blank after trimming, or the literal
# text "null" in any letter case. The table is then overwritten in place.
table_name = "sc_gold.historico_de_servicos_2"
df0 = spark.table(table_name)

viatura_trimmed = F.trim(F.col("viatura"))
viatura_is_usable = (
    F.col("viatura").isNotNull()
    & (viatura_trimmed != "")
    & (F.lower(viatura_trimmed) != "null")
)
df = df0.where(viatura_is_usable)

# Write the filtered rows back over the same table.
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable(table_name)
)

In [0]:
# Load the service-history working table into `df`.
df = spark.table("workspace.sc_gold.historico_de_servicos_2")

In [0]:
from pyspark.sql import functions as F

# Null profile of the service-history working table: percentage of NULLs per column.
table_name = "workspace.sc_gold.historico_de_servicos_2"
df = spark.table(table_name)

# The row count is the denominator of every percentage.
total_rows = df.count()

# One aggregate per column: count the NULL hits, then scale to a percentage.
null_pct_exprs = []
for col_name in df.columns:
    null_hits = F.count(F.when(F.col(col_name).isNull(), col_name))
    null_pct_exprs.append((null_hits / total_rows * 100).alias(col_name))

null_percentages = df.select(null_pct_exprs)

display(null_percentages)
# NULLs in descricao_servico_pos_venda are not relevant, since in principle that
# column will not be used.

In [0]:
%sql
-- Number of distinct service types in the working table.
SELECT COUNT(DISTINCT tipo_de_servico) AS distinct_count
FROM workspace.sc_gold.historico_de_servicos_2;

-- The distinct service-type values themselves, in alphabetical order.
SELECT DISTINCT tipo_de_servico
FROM workspace.sc_gold.historico_de_servicos_2
ORDER BY tipo_de_servico;

In [0]:
# Join vehicles with their service history.
# Every vehicle is kept (LEFT JOIN); vehicles without history get NULL in the
# service columns. The statement is run through spark.sql rather than a %sql cell.
join_query = """
CREATE OR REPLACE TABLE workspace.sc_gold.join_viatura_historico AS
SELECT
  v.*,
  h.*
FROM sc_gold.viaturas_2 AS v
LEFT JOIN workspace.sc_gold.historico_de_servicos_2 AS h
  ON v.id = h.viatura
"""

spark.sql(join_query)

In [0]:
# Keep only the columns needed downstream: drop the join keys, the free-text service
# description and the raw numeric/engine attributes from the joined table.
df = spark.table("sc_gold.join_viatura_historico")

columns_to_drop = [
    "viatura",
    "id",
    "descricao_servico_pos_venda",
    "age_year",
    "cilindrada__cm3_",
    "motorizacao",
    "potencia_maxima__kw_",
]
df_clean = df.drop(*columns_to_drop)

# Overwrite the joined table with the reduced dataset.
writer = df_clean.write.mode("overwrite").option("overwriteSchema", "true")
writer.saveAsTable("sc_gold.join_viatura_historico")

In [0]:
%sql
-- Preview the joined vehicles / service-history table.
SELECT * FROM workspace.sc_gold.join_viatura_historico

In [0]:
# Understand the NULLs introduced by the join: percentage of NULLs per column.
from pyspark.sql import functions as F

table_name = "workspace.sc_gold.join_viatura_historico"
df = spark.table(table_name)

# The row count is the denominator of every percentage.
total_rows = df.count()

# One aggregate per column: count the NULL hits, then scale to a percentage.
null_pct_exprs = []
for col_name in df.columns:
    null_hits = F.count(F.when(F.col(col_name).isNull(), col_name))
    null_pct_exprs.append((null_hits / total_rows * 100).alias(col_name))

null_percentages = df.select(null_pct_exprs)

display(null_percentages)

In [0]:
# Drop joined rows with no service type, i.e. vehicles for which the join found no
# history (98 879 rows excluded, per the original author's note). A value counts as
# missing when it is NULL, blank after trimming, or the literal text "null".
table_name = "sc_gold.join_viatura_historico"
df0 = spark.table(table_name)

servico_trimmed = F.trim(F.col("tipo_de_servico"))
servico_is_usable = (
    F.col("tipo_de_servico").isNotNull()
    & (servico_trimmed != "")
    & (F.lower(servico_trimmed) != "null")
)
df = df0.where(servico_is_usable)

# Write the filtered rows back over the same table.
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable(table_name)
)

In [0]:
# Load the joined vehicles / service-history table into a Spark DataFrame
df = spark.table("workspace.sc_gold.join_viatura_historico")

# Render the DataFrame in the notebook output for a visual check
display(df)


new version

In [0]:
from pyspark.sql import functions as F

# Build one row per vehicle group with a 0/1 flag column for every service type.

# Source
df = spark.table("workspace.sc_gold.join_viatura_historico")

GROUP_COLS = ["modelo", "combustivel", "age_interval", "classe_potencia"]
SERVICE_COL = "tipo_de_servico"

# 1) Keep only rows with a service and remove duplicates per group + service
df_distinct = (
    df.select(*GROUP_COLS, SERVICE_COL)
      .where(F.col(SERVICE_COL).isNotNull())
      .dropDuplicates(GROUP_COLS + [SERVICE_COL])
)

# 2) Pivot: one column per service (the count is 1 by design after dropDuplicates;
#    combinations that never occur come out NULL and are turned into 0)
pivot_df = (
    df_distinct
      .groupBy(*GROUP_COLS)
      .pivot(SERVICE_COL)
      .agg(F.count(F.lit(1)))
      .fillna(0)
)

# 3) Force every service column to a binary int in a single projection, instead of
#    one withColumn call per column.
service_cols = [c for c in pivot_df.columns if c not in GROUP_COLS]


def _quoted(column_name):
    """Backtick-quote a column name, doubling any backtick it already contains."""
    return "`" + column_name.replace("`", "``") + "`"


# Names with spaces or accents exist, so every reference is backtick-quoted.
pivot_df = pivot_df.select(
    *GROUP_COLS,
    *[(F.col(_quoted(c)) > 0).cast("int").alias(c) for c in service_cols],
)

flags_df = pivot_df

display(flags_df)

# The set of service columns depends on the data, so the table schema can differ
# between runs. overwriteSchema lets the overwrite replace it, as every other write
# in this notebook does.
(flags_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.sc_gold.join_viatura_historico_flags"))


In [0]:

 %pip install mlxtend

In [0]:
# --- 1) Imports ---
import pandas as pd
from pyspark.sql import functions as F
from mlxtend.frequent_patterns import apriori, association_rules

# Mine association rules between service types. Each basket is the set of distinct
# services seen for one (modelo, combustivel, age_interval, classe_potencia) group.

# --- 2) Read and prepare in Spark ---
group_cols = ["modelo", "combustivel", "age_interval", "classe_potencia"]
target_col = "tipo_de_servico"

df_s = (
    spark.table("workspace.sc_gold.join_viatura_historico")
         .select(*(group_cols + [target_col]))
)

# Drop rows with NULL in any key column or in the target
df_s = df_s.na.drop(subset=group_cols + [target_col])

# Normalise the target to string (avoids warnings further down)
df_s = df_s.withColumn(target_col, F.col(target_col).cast("string"))

# --- 3) Build the transactions in Spark (one basket per attribute combination) ---
# collect_set de-duplicates while aggregating, so the full list of repeated services
# is never materialised; array_sort keeps the item order deterministic.
transactions_s = (
    df_s.groupBy(*[F.col(c) for c in group_cols])
        .agg(F.array_sort(F.collect_set(F.col(target_col))).alias("items"))
)

# To inspect a sample:
# transactions_s.show(truncate=False)

# --- 4) Bring only the 'items' column to pandas ---
transactions = transactions_s.select("items").toPandas()
transactions_list = [list(x) for x in transactions["items"]]
num_tx = len(transactions_list)

# --- 5) One-hot encode (transactions x items matrix) ---
# Built as a boolean frame in one go. This is the dtype used by the sibling apriori
# cell in this notebook, and it avoids a per-cell Python loop with a column lookup
# for every item.
all_items = sorted({it for ts in transactions_list for it in ts})
ohe = pd.DataFrame(
    [[item in basket for item in all_items] for basket in map(set, transactions_list)],
    index=range(num_tx),
    columns=all_items,
    dtype=bool,
)

# --- 6) Apriori + rules ---
min_support = 0.05      # adjust to the number of baskets
min_confidence = 0.6

freq = apriori(ohe, min_support=min_support, use_colnames=True)
rules = association_rules(freq, metric="confidence", min_threshold=min_confidence)

# Keep only rules with a single consequent (one tipo_de_servico as the "prediction")
rules = rules[rules["consequents"].apply(lambda s: len(s) == 1)].copy()

# Sort by lift / confidence / support, strongest first
rules = rules.sort_values(["lift", "confidence", "support"], ascending=False).reset_index(drop=True)

print("Transa√ß√µes:", num_tx)
print("Itemsets frequentes:", len(freq))
print("Regras:", len(rules))
rules.head(20)



BLOCO 44 IGUAL AO 43 MAS COM GRUPOS INCLUIDOS

In [0]:
# --- 1) Imports ---
import pandas as pd
from pyspark.sql import functions as F
from mlxtend.frequent_patterns import apriori, association_rules

# Same mining as the previous cell, but each basket keeps a `tx_id` and its group
# attributes so rules can later be traced back to the groups they occur in.

# --- 2) Parameters ---
group_cols = ["modelo", "combustivel", "age_interval", "classe_potencia"]
target_col = "tipo_de_servico"
min_support = 0.05
min_confidence = 0.6

# --- 3) Read and prepare in Spark ---
needed_cols = group_cols + [target_col]
df_s = spark.table("workspace.sc_gold.join_viatura_historico").select(*needed_cols)
df_s = df_s.na.drop(subset=needed_cols)
df_s = df_s.withColumn(target_col, F.col(target_col).cast("string"))

# --- 4) Build the transactions in Spark (one basket per attribute combination) ---
basket_items = F.array_sort(F.array_distinct(F.collect_list(F.col(target_col))))
transactions_s = df_s.groupBy(*[F.col(c) for c in group_cols]).agg(
    basket_items.alias("items")
)

# Attach an id per basket so the pandas one-hot matrix can be aligned with the metadata
transactions_s = transactions_s.withColumn("tx_id", F.monotonically_increasing_id())

# Basket metadata (id, group attributes, items), ordered by id
tx_meta_pdf = (
    transactions_s.select("tx_id", *group_cols, "items")
                  .orderBy("tx_id")
                  .toPandas()
)

# --- 5) Convert items to lists and build the boolean one-hot matrix ---
transactions_list = [list(items) for items in tx_meta_pdf["items"]]
num_tx = len(transactions_list)

all_items = sorted({it for ts in transactions_list for it in ts})
ohe = pd.DataFrame(False, index=tx_meta_pdf["tx_id"], columns=all_items)  # bool from the start
for basket_id, basket in zip(tx_meta_pdf["tx_id"], transactions_list):
    for item in basket:
        ohe.at[basket_id, item] = True

# --- 6) Apriori + rules ---
freq = apriori(ohe, min_support=min_support, use_colnames=True)
rules = association_rules(freq, metric="confidence", min_threshold=min_confidence)

# Keep only rules with a single consequent
single_consequent = rules["consequents"].apply(lambda consequent: len(consequent) == 1)
rules = rules[single_consequent].copy()

# Sort by lift / confidence / support, strongest first
rules = rules.sort_values(
    ["lift", "confidence", "support"], ascending=False
).reset_index(drop=True)

print("Transa√ß√µes:", num_tx)
print("Itemsets frequentes:", len(freq))
print("Regras:", len(rules))
# --- 7) Mapear cada regra aos cestos (grupos) onde ela acontece ---
def baskets_mask_for_itemset(itemset_frozenset):
    """Return a boolean Series (indexed like `ohe`) marking baskets with ALL the items.

    An empty itemset, or one naming an item that has no column in `ohe`, matches
    no basket at all.
    """
    wanted = list(itemset_frozenset)
    unusable = not wanted or any(item not in ohe.columns for item in wanted)
    if unusable:
        return pd.Series(False, index=ohe.index)
    # A basket qualifies only when every requested item column is True in its row.
    return ohe[wanted].all(axis=1)

# For every rule, find the baskets (groups) where it actually occurs, i.e. where the
# antecedent AND the consequent are both present, plus the baskets where the
# antecedent alone appears.
support_masks = []
support_counts = []
examples = []
ante_counts = []

max_examples = 10  # how many groups to show per rule (sample)

for _, rule in rules.iterrows():
    antecedent = rule["antecedents"]
    consequent = rule["consequents"]

    # Baskets that contain the full rule (antecedent plus consequent)
    full_itemset = frozenset(set(antecedent) | set(consequent))
    full_mask = baskets_mask_for_itemset(full_itemset)
    support_masks.append(full_mask)
    support_counts.append(int(full_mask.sum()))

    # Sample of groups (up to max_examples) where the rule occurs
    matching_ids = ohe.index[full_mask]
    sample_ids = list(matching_ids[:max_examples])
    in_sample = tx_meta_pdf["tx_id"].isin(sample_ids)
    examples.append(
        tx_meta_pdf[in_sample][["tx_id"] + group_cols].to_dict(orient="records")
    )

    # Baskets that contain the antecedent, whether or not the consequent is present
    antecedent_mask = baskets_mask_for_itemset(antecedent)
    ante_counts.append(int(antecedent_mask.sum()))

# Attach the counts and examples to the rules
rules["group_count_support"] = support_counts
rules["groups_example_support"] = examples
rules["group_count_antecedent"] = ante_counts

# Show the top rules together with example groups
cols_to_show = ["antecedents", "consequents", "support", "confidence", "lift",
                "group_count_support", "group_count_antecedent", "groups_example_support"]
rules[cols_to_show].head(20)


In [0]:
# List every group (basket) in which the rule at position `rule_idx` fully occurs.
rule_idx = 0
ant = rules.loc[rule_idx, "antecedents"]
cons = rules.loc[rule_idx, "consequents"]

rule_items = frozenset(set(ant) | set(cons))
mask_full = baskets_mask_for_itemset(rule_items)
tx_ids = ohe.index[mask_full]

is_matching_basket = tx_meta_pdf["tx_id"].isin(tx_ids)
tx_meta_pdf[is_matching_basket][["tx_id"] + group_cols]

OUTRA ALTERNATIVA COM ANTECEDENTE GRUPOS E TIPO SERVIÇO DESC

In [0]:
# --- 1) Imports ---
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
from mlxtend.frequent_patterns import apriori, association_rules

# --- 2) Parameters ---
# ANTECEDENT: always these four features.
group_cols = ["modelo", "combustivel", "age_interval", "classe_potencia"]
# CONSEQUENT: a single service.
target_col = "tipo_de_servico"
min_support = 0.005  # ~0.5% (tune to the data volume)
min_conf = 0.60
lift_min = 1.0

# --- 3) Read only the required columns in Spark ---
needed_cols = [*group_cols, target_col]
df_s = spark.table("workspace.sc_gold.join_viatura_historico").select(*needed_cols)
df_s = df_s.na.drop(subset=needed_cols)
df_s = df_s.withColumn(target_col, F.col(target_col).cast("string"))

# (Optional) normalise combustivel/modelo here for better grouping.

# --- 4) Convert to pandas (only the required columns) ---
pdf = df_s.toPandas()

# --- 5) Build ROW-level transactions (each record = one basket) ---
def build_items_row(row):
    """Return the basket items for one record.

    The four antecedent features are encoded as "key=value" strings (one per
    column in ``group_cols``), followed by the single consequent service
    encoded as "svc:<value>" (taken from ``target_col``).
    """
    basket = []
    for col in group_cols:
        basket.append(f"{col}={str(row[col])}")
    basket.append(f"svc:{str(row[target_col])}")
    return basket

pdf["items_all"] = pdf.apply(build_items_row, axis=1)
transactions_list = pdf["items_all"].tolist()
num_tx = len(transactions_list)

# --- 6) One-hot encode (bool) ---
distinct_items = set()
for ts in transactions_list:
    distinct_items.update(ts)
all_items = sorted(distinct_items)

# Memory warning: this allocates a (num_tx x number of items) matrix; sample if it gets large.
ohe = pd.DataFrame(False, index=range(num_tx), columns=all_items)
# Column positions follow the order of all_items, which is also the column order.
col_position = {name: pos for pos, name in enumerate(all_items)}
for i, items in enumerate(transactions_list):
    for it in items:
        ohe.iat[i, col_position[it]] = True

# --- 7) Apriori + rules ---
# max_len=5 caps itemsets at 5 items (4 features + 1 service).
freq = apriori(ohe, min_support=min_support, use_colnames=True, max_len=5)
rules = association_rules(freq, metric="confidence", min_threshold=min_conf)

# Normalise both rule sides to frozenset.
for side in ("antecedents", "consequents"):
    rules[side] = rules[side].apply(lambda members: frozenset(members))

# --- 8) Filters: antecedent = EXACTLY 4 features (one of each), consequent = 1 service ---
def is_full_feature_set(s):
    """Return True when ``s`` has four items covering each feature prefix once.

    Items are compared through ``str(item)``; any item that matches none of
    the four feature prefixes makes the result False.
    """
    required = ("modelo=", "combustivel=", "age_interval=", "classe_potencia=")
    if len(s) != 4:
        return False

    seen = set()
    for item in s:
        text = str(item)
        matched = [prefix for prefix in required if text.startswith(prefix)]
        if not matched:
            return False
        seen.add(matched[0])
    return seen == set(required)

def only_service(s):
    """Return True when ``s`` holds exactly one item and it starts with "svc:"."""
    if len(s) != 1:
        return False
    (single,) = s
    return str(single).startswith("svc:")

mask_ante = rules["antecedents"].apply(is_full_feature_set).astype(bool)
mask_cons = rules["consequents"].apply(only_service).astype(bool)

# Keep rules with the right shape and a lift at or above the threshold.
keep = mask_ante & mask_cons & (rules["lift"] >= lift_min)
rules = rules[keep].copy()

# Human-readable strings for both rule sides.
for text_col, set_col in (("ante_str", "antecedents"), ("cons_str", "consequents")):
    rules[text_col] = rules[set_col].apply(lambda itemset: ", ".join(sorted(itemset)))

# Sort by lift, then confidence, then support (all descending).
rules = rules.sort_values(["lift", "confidence", "support"], ascending=False)
rules = rules.reset_index(drop=True)

print("Transa√ß√µes:", num_tx)
print("Itemsets frequentes:", len(freq))
print("Regras (4 features ‚Üí 1 servi√ßo):", len(rules))
report_cols = ["ante_str", "cons_str", "support", "confidence", "lift"]
display(rules[report_cols].head(30))


In [0]:
# --- 1) Imports ---
import pandas as pd
import numpy as np
from ast import literal_eval
from pyspark.sql import functions as F
from mlxtend.frequent_patterns import apriori, association_rules

# --- 2) Parameters ---
group_cols = ["modelo", "combustivel", "age_interval", "classe_potencia"]
target_col = "tipo_de_servico"
min_support = 0.05      # tune to the number of baskets
min_confidence = 0.60   # minimum confidence
lift_min = 1.0          # >= 1 means a positive association
max_examples = 10       # number of groups to list per rule (a sample)

# --- 3) Read and prepare in Spark ---
needed_cols = group_cols + [target_col]
df_s = spark.table("workspace.sc_gold.join_viatura_historico").select(*needed_cols)
df_s = df_s.na.drop(subset=needed_cols)
df_s = df_s.withColumn(target_col, F.col(target_col).cast("string"))

# --- 4) Build transactions (one basket per combination of attributes) ---
# Each basket carries the sorted, de-duplicated services seen for its group.
distinct_services = F.array_sort(F.array_distinct(F.collect_list(F.col(target_col))))
transactions_s = (
    df_s.groupBy(*[F.col(c) for c in group_cols])
        .agg(distinct_services.alias("svc_items"))
        .withColumn("tx_id", F.monotonically_increasing_id())
)

# Basket (group) metadata, brought to pandas ordered by tx_id.
meta_select = transactions_s.select("tx_id", *group_cols, "svc_items")
tx_meta_pdf = meta_select.orderBy("tx_id").toPandas()

# --- 5) Build the basket items: features (= antecedent) + services (= consequent)
def build_items_row(row):
    """Return the basket items for one group row.

    The attributes in ``group_cols`` become "column=value" strings. The
    ``svc_items`` cell is normalised to a list (it may arrive as a list-like,
    a numpy array, a string that looks like a list, a missing value, or a
    scalar) and every non-missing, non-blank entry becomes "svc:<value>".
    """
    # Attributes (antecedent).
    basket = [f"{col}={str(row[col])}" for col in group_cols]

    # Services (consequent): normalise to a list without relying on "or []".
    raw = row.get("svc_items", None)
    if isinstance(raw, np.ndarray):
        services = raw.tolist()
    elif isinstance(raw, (list, tuple, set)):
        services = list(raw)
    elif isinstance(raw, str):
        # Default: keep the string as a single service.
        services = [raw]
        stripped = raw.strip()
        if stripped.startswith("[") and stripped.endswith("]"):
            # The string is shaped like a list literal; try to parse it.
            try:
                parsed = literal_eval(raw)
            except Exception:
                parsed = None
            if isinstance(parsed, (list, tuple, set)):
                services = list(parsed)
    elif raw is None or (isinstance(raw, float) and pd.isna(raw)):
        services = []
    else:
        services = [raw]

    for svc in services:
        if pd.notna(svc) and str(svc).strip() != "":
            basket.append(f"svc:{str(svc)}")
    return basket

tx_meta_pdf["items_all"] = tx_meta_pdf.apply(build_items_row, axis=1)
transactions_list = tx_meta_pdf["items_all"].tolist()
num_tx = len(transactions_list)

# --- 6) One-hot encode (bool) ---
distinct_items = set()
for ts in transactions_list:
    distinct_items.update(ts)
all_items = sorted(distinct_items)

ohe = pd.DataFrame(False, index=tx_meta_pdf["tx_id"], columns=all_items)
known_items = set(ohe.columns)
for tx_id, items in zip(tx_meta_pdf["tx_id"], transactions_list):
    # Only flag items that exist as columns of the one-hot matrix.
    for it in known_items.intersection(items):
        ohe.at[tx_id, it] = True

# --- 7) Apriori + rules ---
freq = apriori(ohe, min_support=min_support, use_colnames=True)
rules = association_rules(freq, metric="confidence", min_threshold=min_confidence)

# Normalise types (guarantee frozenset on both sides).
for side in ("antecedents", "consequents"):
    rules[side] = rules[side].apply(lambda members: frozenset(members))

# 7.1 Keep rules with a single-item consequent.
has_single_consequent = rules["consequents"].apply(lambda cons_set: len(cons_set) == 1)
rules = rules[has_single_consequent].copy()

# 7.2 Shape filter (antecedent only features; consequent only "svc:" services)
def only_features(s):
    """Return True when every item in ``s`` starts with one of the feature prefixes.

    An empty ``s`` yields True, because no item violates the condition.
    """
    feature_prefixes = ("modelo=", "combustivel=", "age_interval=", "classe_potencia=")
    for item in s:
        if not item.startswith(feature_prefixes):
            return False
    return True

def only_service(s):
    """Return True when every item in ``s`` starts with "svc:" (True for an empty ``s``)."""
    for item in s:
        if not item.startswith("svc:"):
            return False
    return True

mask_ante = rules["antecedents"].apply(only_features).astype(bool)
mask_cons = rules["consequents"].apply(only_service).astype(bool)

# 7.3 Shape filter plus the optional quality filter on lift, applied together.
keep = mask_ante & mask_cons & (rules["lift"] >= lift_min)
rules = rules[keep].copy()

# 7.4 Human-readable strings for both rule sides.
for text_col, set_col in (("ante_str", "antecedents"), ("cons_str", "consequents")):
    rules[text_col] = rules[set_col].apply(lambda itemset: ", ".join(sorted(itemset)))

# Sort by lift, then confidence, then support (all descending).
rules = rules.sort_values(["lift", "confidence", "support"], ascending=False)
rules = rules.reset_index(drop=True)

print("Transa√ß√µes:", num_tx)
print("Itemsets frequentes:", len(freq))
print("Regras (features ‚Üí servi√ßo):", len(rules))
report_cols = ["ante_str", "cons_str", "support", "confidence", "lift"]
display(rules[report_cols].head(20))  # Databricks-friendly

# --- 8) (Optional) Map rules to the concrete groups where they occur (examples) ---
def baskets_mask_for_itemset(itemset_frozenset):
    """Return a boolean Series over ``ohe.index`` marking baskets that hold every item.

    The mask is all False when the itemset is empty or when any of its items
    is not a column of ``ohe``.
    """
    wanted = list(itemset_frozenset)
    usable = bool(wanted) and all(col in ohe.columns for col in wanted)
    if usable:
        return ohe[wanted].all(axis=1)
    return pd.Series(False, index=ohe.index)

support_counts, ante_counts, examples = [], [], []
meta_cols = ["tx_id"] + group_cols
for _, r in rules.iterrows():
    ant, cons = r["antecedents"], r["consequents"]
    both = frozenset(ant).union(cons)

    m_support = baskets_mask_for_itemset(both)  # baskets with antecedent + consequent
    m_ante = baskets_mask_for_itemset(ant)      # baskets with the antecedent
    support_counts.append(int(m_support.sum()))
    ante_counts.append(int(m_ante.sum()))

    # Metadata for a sample (at most max_examples) of the supporting baskets.
    sample_ids = list(ohe.index[m_support][:max_examples])
    in_sample = tx_meta_pdf["tx_id"].isin(sample_ids)
    sample_rows = tx_meta_pdf.loc[in_sample, meta_cols].to_dict(orient="records")
    examples.append(sample_rows)

# Attach the per-rule results (same column order as the lists below).
new_columns = {
    "group_count_support": support_counts,
    "group_count_antecedent": ante_counts,
    "groups_example_support": examples,
}
for col_name, values in new_columns.items():
    rules[col_name] = values

# Show the top 10 with examples.
cols_to_show = [
    "ante_str", "cons_str", "support", "confidence", "lift",
    "group_count_support", "group_count_antecedent", "groups_example_support",
]
display(rules[cols_to_show].head(10))


In [0]:
import matplotlib.pyplot as plt

# Convert frozenset -> string so the rules can be filtered with string matching.
rules["ante_str"] = rules["antecedents"].apply(lambda x: ", ".join(sorted(x)))
rules["cons_str"] = rules["consequents"].apply(lambda x: ", ".join(sorted(x)))

# Models of interest (edit as needed).
# NOTE: every entry needs its own comma. Adjacent string literals are silently
# concatenated by Python, so a missing comma between "Kauai Ev" and "I20"
# produced the single keyword "Kauai EvI20" and dropped "I20" from the filter.
model_keywords = ["TUCSON", "Kauai Ev", "I20", "I30", "KONA", "KAUAI", "IX35", "IONIQ", "SANTA FE"]

# Case-insensitive alternation over all keywords, built once and reused.
model_pattern = "|".join(model_keywords)

# Keep rules whose ANTECEDENT mentions a model and whose CONSEQUENT does not
# (i.e. the consequent is a service).
rules_filtered = rules[
    rules["ante_str"].str.contains(model_pattern, case=False)
    & ~rules["cons_str"].str.contains(model_pattern, case=False)
]

# Top 10 by lift.
top_rules = rules_filtered.nlargest(10, "lift").copy()

# Build the "antecedent -> consequent" label shown on the y axis.
top_rules["rule_label"] = top_rules["ante_str"] + "  ‚Üí  " + top_rules["cons_str"]

# Plot: horizontal bars, highest lift at the top.
plt.figure(figsize=(10, 6))
plt.barh(top_rules["rule_label"], top_rules["lift"])
plt.xlabel("Lift (for√ßa da associa√ß√£o)")
plt.title("Top 10 Regras Modelo ‚Üí Tipo de Servi√ßo")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()


In [0]:
# Scatter of support against confidence for the model-filtered rules.
x_support = rules_filtered['support']
y_confidence = rules_filtered['confidence']

plt.figure(figsize=(8, 6))
plt.scatter(x_support, y_confidence, alpha=0.7)
plt.title("Model ‚Üí Service Rule Strength")
plt.xlabel("Support (frequency)")
plt.ylabel("Confidence (reliability)")
plt.grid(True)
plt.show()


In [0]:
import seaborn as sns

# Create a pivot table of lift keyed by readable strings rather than frozensets.
# Frozensets only define a subset partial order (so sorting them is not a
# meaningful ordering) and their repr is what would appear on the axis ticks.
# The labels are derived locally so that rules_filtered itself is left untouched.
heat_df = rules_filtered.assign(
    ante_label=rules_filtered["antecedents"].apply(lambda s: ", ".join(sorted(s))),
    cons_label=rules_filtered["consequents"].apply(lambda s: ", ".join(sorted(s))),
)
pivot = heat_df.pivot_table(index="ante_label", columns="cons_label", values="lift")

plt.figure(figsize=(10, 6))
sns.heatmap(pivot, annot=True, fmt=".2f", cmap="YlGnBu")
plt.title("Lift by Model and Service Type")
plt.ylabel("Model")
plt.xlabel("Service Type")
plt.show()

FP-GROWTH

In [0]:
# --- 0) (se faltar) ---
# %pip install mlxtend

import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules

# Starting point: the pandas dataframe `pdf` built in the row-level Apriori cell.
# It has the columns: modelo, combustivel, age_interval, classe_potencia, tipo_de_servico

group_cols = ["modelo", "combustivel", "age_interval", "classe_potencia"]
target_col = "tipo_de_servico"


# 1) Row-level transactions: 4 features + 1 service
def build_items_row(row):
    """Return the basket for one record: four "column=value" items plus one "svc:<value>"."""
    basket = []
    for column in group_cols:
        basket.append(f"{column}={row[column]}")
    basket.append(f"svc:{row[target_col]}")
    return basket

required_cols = [*group_cols, target_col]
pdf = pdf.dropna(subset=required_cols).copy()
pdf["items_all"] = pdf.apply(build_items_row, axis=1)
transactions_list = pdf["items_all"].tolist()

# 2) Boolean one-hot encoding
distinct_items = set()
for ts in transactions_list:
    distinct_items.update(ts)
all_items = sorted(distinct_items)

ohe = pd.DataFrame(False, index=range(len(transactions_list)), columns=all_items)
locate = ohe.columns.get_loc  # column label -> integer position
for i, items in enumerate(transactions_list):
    for it in items:
        ohe.iat[i, locate(it)] = True

# 3) FP-Growth (mlxtend) + rules
min_support = 0.005  # ~0.5% (tune as needed)
min_conf = 0.60
min_lift = 1.0

# max_len=5 caps itemsets at 5 items (4 features + 1 service).
freq = fpgrowth(ohe, min_support=min_support, use_colnames=True, max_len=5)
rules = association_rules(freq, metric="confidence", min_threshold=min_conf)

# 4) Normalise both rule sides to frozenset
for side in ("antecedents", "consequents"):
    rules[side] = rules[side].apply(lambda members: frozenset(members))

# 5) Filters: antecedent = EXACTLY 4 items, one per prefix; consequent = 1 service (svc:)
def is_full_feature_set(s):
    """Return True when ``s`` has four items that together cover all four feature prefixes."""
    if len(s) != 4:
        return False

    expected = ("modelo=", "combustivel=", "age_interval=", "classe_potencia=")
    covered = set()
    for item in s:
        label = str(item)
        prefix = next((p for p in expected if label.startswith(p)), None)
        if prefix is None:
            # An item that is not a feature disqualifies the whole set.
            return False
        covered.add(prefix)
    # covered only ever holds expected prefixes, so equal sizes mean equal sets.
    return len(covered) == len(expected)

def only_service(s):
    """Return True when ``s`` has exactly one item and that item starts with "svc:"."""
    if len(s) != 1:
        return False
    return str(next(iter(s))).startswith("svc:")

mask_ante = rules["antecedents"].apply(is_full_feature_set)
mask_cons = rules["consequents"].apply(only_service)

# Shape filter first, then the lift threshold.
rules = rules[mask_ante & mask_cons].copy()
rules = rules[rules["lift"] >= min_lift].copy()

# 6) Human-readable strings, then sort by lift, confidence and support (descending)
for text_col, set_col in (("ante_str", "antecedents"), ("cons_str", "consequents")):
    rules[text_col] = rules[set_col].apply(lambda itemset: ", ".join(sorted(itemset)))
rules = rules.sort_values(["lift", "confidence", "support"], ascending=False)
rules = rules.reset_index(drop=True)

print("Regras (4 features ‚Üí 1 servi√ßo):", len(rules))
report_cols = ["ante_str", "cons_str", "support", "confidence", "lift"]
rules[report_cols].head(20)


In [0]:
import re
import matplotlib.pyplot as plt

def plot_topN_rules(rules_df, metric="lift", topn=10, title=None):
    """Draw a horizontal bar chart of the top ``topn`` rules ranked by ``metric``.

    Works on a copy of ``rules_df``. The readable ``ante_str`` / ``cons_str``
    columns are created from ``antecedents`` / ``consequents`` when they are
    missing. A leading "svc:" is stripped from the consequent text for display.
    When ``title`` is falsy, a default title is built from ``topn`` and ``metric``.
    """
    def _readable(itemset):
        # Values that are already strings pass through; otherwise join the sorted items.
        return itemset if isinstance(itemset, str) else ", ".join(sorted(itemset))

    table = rules_df.copy()

    # Guarantee the readable columns exist.
    for text_col, set_col in (("ante_str", "antecedents"), ("cons_str", "consequents")):
        if text_col not in table:
            table[text_col] = table[set_col].apply(_readable)

    # Drop the leading 'svc:' from the consequent, purely for nicer labels.
    table["cons_str"] = table["cons_str"].str.replace(r"^svc:\s*", "", regex=True)

    # Select the top-N rows and build their labels.
    top = table.sort_values(metric, ascending=False).head(topn).copy()
    top["label"] = top["ante_str"] + "  ‚Üí  " + top["cons_str"]

    # Plot, with the highest-ranked rule at the top.
    chart_title = title or f"Top {topn} regras por {metric}"
    plt.figure(figsize=(12, 6))
    plt.barh(top["label"], top[metric])
    plt.xlabel(metric.title())
    plt.title(chart_title)
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

# Example usage: one Top-10 chart ranked by lift, then one ranked by confidence.
for _metric, _title in (
    ("lift", "Top 10 regras (features ‚Üí servi√ßo) por Lift"),
    ("confidence", "Top 10 regras por Confidence"),
):
    plot_topN_rules(rules, metric=_metric, topn=10, title=_title)
