In [0]:
%sql
-- Preview the source service-history table.
SELECT *
FROM sc_gold.historico_de_servicos

In [0]:
%sql
-- Preview the source vehicles table.
SELECT *
FROM sc_gold.viaturas

In [0]:
%sql
-- Rebuild the working copy of the service history, keeping only the columns listed below.
DROP TABLE IF EXISTS sc_gold.historico_de_servicos_2;

CREATE TABLE sc_gold.historico_de_servicos_2 AS
SELECT
    numero_do_servico_pos_venda,
    data_de_fecho,
    data_de_abertura,
    canal_de_venda,
    data_servico_pos_venda,
    kms,
    total_mao_de_obra,
    descricao_servico_pos_venda,
    nome_concessao,
    viatura,
    cliente,
    ordem_reparacao,
    tipo_de_servico,
    origem_registo,
    id,
    pedido_do_cliente
FROM sc_gold.historico_de_servicos;


In [0]:
%sql
-- Preview the working copy of the service history.
SELECT *
FROM sc_gold.historico_de_servicos_2

In [0]:
%sql
-- Rebuild the working copy of the vehicles table, keeping only the columns listed below.
DROP TABLE IF EXISTS sc_gold.viaturas_2;

CREATE TABLE sc_gold.viaturas_2 AS
SELECT
    designacao_comercial,
    modelo,
    motorizacao,
    versao,
    data_de_matricula,
    cilindrada__cm3_,
    potencia_maxima__kw_,
    combustivel,
    gwms_engine,
    production_date
FROM sc_gold.viaturas;

In [0]:
%sql
-- Preview the working copy of the vehicles table.
SELECT *
FROM sc_gold.viaturas_2

In [0]:
from pyspark.sql import functions as F

table_name = "sc_gold.viaturas_2"

# Load the working table and take its row count, which is the denominator for every column.
df = spark.table(table_name)
total_rows = df.count()

# One aggregate per column: (rows where the column is NULL / total rows) * 100,
# reported under the column's own name.
null_pct_exprs = [
    (F.count(F.when(F.col(name).isNull(), name)) / total_rows * 100).alias(name)
    for name in df.columns
]
null_percentages = df.select(null_pct_exprs)

display(null_percentages)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window as W
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType, DecimalType

In [0]:
# Remove vehicles whose "modelo" is missing: NULL, blank after trimming, or the literal
# text "null" (the original note reports 486 rows removed by this step).
table_name = "sc_gold.viaturas_2"
df0 = spark.table(table_name)

# Trimmed view of the column, used only for the emptiness checks.
modelo_trimmed = F.trim(F.col("modelo"))
has_modelo = (
    F.col("modelo").isNotNull()
    & (modelo_trimmed != "")
    & (F.lower(modelo_trimmed) != "null")
)
df = df0.filter(has_modelo)

# Write the filtered rows back over the same table.
overwrite_writer = df.write.mode("overwrite").option("overwriteSchema", "true")
overwrite_writer.saveAsTable(table_name)


In [0]:
# Measure the average number of days between the production date and the registration date.
# Nothing is written back to the table; the figures are only printed.

df = spark.table("sc_gold.viaturas_2")

# Accepted date layouts. This is the same list the imputation step uses when it parses these
# columns, so the average is computed over every row that step is able to parse.
date_formats = ["dd-MM-yyyy", "yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd"]

# Add a parsed DATE companion column (<name>_dt) for each raw column: the first layout that
# parses wins, and the value is NULL when none does.
for source_col in ("data_de_matricula", "production_date"):
    df = df.withColumn(
        f"{source_col}_dt",
        F.coalesce(*[F.expr(f"try_to_date({source_col}, '{fmt}')") for fmt in date_formats]),
    )

# Only rows where both dates parsed can contribute to the difference.
df_valid = df.filter(
    F.col("data_de_matricula_dt").isNotNull() & F.col("production_date_dt").isNotNull()
)

# Difference in days: registration date minus production date.
df_valid = df_valid.withColumn(
    "diff_days",
    F.datediff(F.col("data_de_matricula_dt"), F.col("production_date_dt")),
)

# Sum, row count and mean of the day differences.
agg = df_valid.agg(
    F.sum("diff_days").alias("soma_dias"),
    F.count("diff_days").alias("n_linhas"),
    F.avg("diff_days").alias("media_dias"),
).collect()[0]

# The average is None when no row has both dates; round() would raise on None, so guard it.
media_dias = agg["media_dias"]
print("üîπ Soma total dos dias:", agg["soma_dias"])
print("üîπ N√∫mero de linhas usadas:", agg["n_linhas"])
print("üîπ M√©dia de dias:", round(media_dias, 2) if media_dias is not None else None)


In [0]:
# Impute a missing production date or registration date from the other one, using the
# 131-day gap obtained from the preceding average-gap calculation.

df = spark.table("sc_gold.viaturas_2")

gap_days = 131
accepted_formats = ("dd-MM-yyyy", "yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd")

# Replace each raw column with its parsed DATE value: the first layout that parses wins,
# and the value is NULL when none does.
for date_col in ("data_de_matricula", "production_date"):
    parsed = F.coalesce(
        *[F.expr(f"try_to_date({date_col}, '{fmt}')") for fmt in accepted_formats]
    )
    df = df.withColumn(date_col, parsed)

matricula = F.col("data_de_matricula")
producao = F.col("production_date")

# Missing production date -> registration date minus the gap.
df = df.withColumn(
    "production_date",
    F.when(producao.isNull() & matricula.isNotNull(), F.date_sub(matricula, gap_days))
     .otherwise(producao),
)
# Missing registration date -> production date plus the gap.
df = df.withColumn(
    "data_de_matricula",
    F.when(matricula.isNull() & producao.isNotNull(), F.date_add(producao, gap_days))
     .otherwise(matricula),
)

# Overwrite the table; overwriteSchema lets the two columns change type to DATE.
delta_writer = df.write.format("delta").mode("overwrite").option("overwriteSchema", "true")
delta_writer.saveAsTable("sc_gold.viaturas_2")


In [0]:
# Add the production year and the vehicle age in whole years, both derived from production_date.
df = spark.table("sc_gold.viaturas_2")

# production_date may still be text, so try each known layout and keep the first that parses.
production_formats = ["dd-MM-yyyy", "yyyy-MM-dd", "dd/MM/yyyy"]
parsed_production_date = F.coalesce(
    *[F.expr(f"try_to_date(production_date, '{fmt}')") for fmt in production_formats]
)

# Age = days elapsed from the production date to today, divided by 365.25 and floored.
days_since_production = F.datediff(F.current_date(), F.col("production_date_dt"))

df = df.withColumn("production_date_dt", parsed_production_date)
df = df.withColumn("production_year", F.year("production_date_dt"))
df = df.withColumn("age_year", F.floor(days_since_production / 365.25))

# Overwrite the table; overwriteSchema is needed because new columns are being added.
delta_writer = df.write.format("delta").mode("overwrite").option("overwriteSchema", "true")
delta_writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Fill missing engine displacement (cilindrada__cm3_) with the rounded mean of its
# (gwms_engine, motorizacao, modelo) group, falling back to the table-wide mean.
df = spark.table("sc_gold.viaturas_2")

# Window covering all rows that share the three grouping attributes.
group_window = W.partitionBy("gwms_engine", "motorizacao", "modelo")

# Table-wide mean, rounded to an integer; stays None when the column has no values at all.
overall_mean = df.select(F.avg("cilindrada__cm3_").alias("g")).first()["g"]
global_avg = None if overall_mean is None else int(round(overall_mean))

displacement = F.col("cilindrada__cm3_")
group_fill = F.round(F.col("avg_grupo")).cast(IntegerType())

# NULL values take the group mean (or the global mean when the group has none);
# existing values are kept and cast to integer.
df_with_avg = df.withColumn("avg_grupo", F.avg("cilindrada__cm3_").over(group_window))
df_filled = df_with_avg.withColumn(
    "cilindrada__cm3_",
    F.when(displacement.isNull(), F.coalesce(group_fill, F.lit(global_avg)))
     .otherwise(displacement.cast(IntegerType())),
).drop("avg_grupo")

# Persist the filled result.
result_writer = df_filled.write.mode("overwrite").option("overwriteSchema", "true")
result_writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Fill missing maximum power (potencia_maxima__kw_) with the mean of its
# (gwms_engine, motorizacao, modelo) group rounded to 1 decimal, falling back to the
# table-wide mean.
df = spark.table("sc_gold.viaturas_2")

# Normalise the raw value before casting to double: decimal comma -> dot, then strip every
# character that is not a digit or a dot (e.g. ' kW', spaces).
power_text = F.regexp_replace(F.col("potencia_maxima__kw_"), ",", ".")
power_text = F.regexp_replace(power_text, r"[^0-9.]", "")
df = df.withColumn("potencia_maxima__kw_", power_text.cast(DoubleType()))

# Table-wide mean rounded to 1 decimal; stays None when the column has no values at all.
overall_mean = df.select(F.avg("potencia_maxima__kw_").alias("g")).first()["g"]
global_avg_1d = None if overall_mean is None else round(overall_mean, 1)

# Mean per group.
keys = ["gwms_engine", "motorizacao", "modelo"]
avg_by_group = df.groupBy(*keys).agg(F.avg("potencia_maxima__kw_").alias("avg_grp"))

# Only NULL values are replaced; when the group mean is NULL the global mean is used.
# NOTE(review): rows with a NULL in any key column find no match in this join, so they
# always take the global mean - confirm this is intended.
power = F.col("potencia_maxima__kw_")
group_or_global = F.coalesce(F.round(F.col("avg_grp"), 1), F.lit(global_avg_1d))
df_joined = df.join(avg_by_group, on=keys, how="left")
df_filled = df_joined.withColumn(
    "potencia_maxima__kw_",
    F.when(power.isNull(), group_or_global).otherwise(power),
).drop("avg_grp")

# (Optional) To normalise the whole column to 1 decimal, including non-null values:
# df_filled = df_filled.withColumn("potencia_maxima__kw_", F.round(F.col("potencia_maxima__kw_"), 1))

# (Optional) To fix the schema type to Decimal(10,1) instead of double:
# df_filled = df_filled.withColumn("potencia_maxima__kw_", F.col("potencia_maxima__kw_").cast(DecimalType(10,1)))

# Persist the filled result.
result_writer = df_filled.write.mode("overwrite").option("overwriteSchema", "true")
result_writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Fill missing 'motorizacao' with the most frequent value (mode) of its
# (gwms_engine, modelo, potencia_maxima__kw_, combustivel) group, falling back to the
# table-wide mode.
df = spark.table("sc_gold.viaturas_2")

# Power is one of the grouping keys, so normalise it to double first
# (decimal comma -> dot, strip every character that is not a digit or a dot).
power_text = F.regexp_replace(F.col("potencia_maxima__kw_"), ",", ".")
power_text = F.regexp_replace(power_text, r"[^0-9.]", "")
df = df.withColumn("potencia_maxima__kw_", power_text.cast(DoubleType()))

# Trim the text columns, then turn blank strings into NULL for every one except 'modelo'.
for text_col in ("motorizacao", "combustivel", "gwms_engine", "modelo"):
    df = df.withColumn(text_col, F.trim(F.col(text_col)))
for blankable in ("motorizacao", "combustivel", "gwms_engine"):
    df = df.withColumn(
        blankable,
        F.when(F.col(blankable) == "", None).otherwise(F.col(blankable)),
    )

# Grouping keys.
keys_mot = ["gwms_engine", "modelo", "potencia_maxima__kw_", "combustivel"]

# Rows that actually carry a 'motorizacao' value; only these are counted.
known_mot = df.filter(F.col("motorizacao").isNotNull())

# Frequency of each value inside its group; row number 1 is the highest count,
# with ties broken alphabetically.
counts_mot = known_mot.groupBy(*keys_mot, "motorizacao").agg(F.count(F.lit(1)).alias("cnt"))
rank_in_group = F.row_number().over(
    W.partitionBy(*keys_mot).orderBy(F.col("cnt").desc(), F.col("motorizacao").asc())
)
mode_motorizacao = (
    counts_mot.withColumn("rn", rank_in_group)
              .where(F.col("rn") == 1)
              .select(*keys_mot, F.col("motorizacao").alias("mode_motorizacao"))
)

# Table-wide mode, used when a row's group offers no value.
top_mot = (
    known_mot.groupBy("motorizacao").agg(F.count(F.lit(1)).alias("cnt"))
             .orderBy(F.col("cnt").desc(), F.col("motorizacao").asc())
             .limit(1).first()
)
global_mode_mot = top_mot["motorizacao"] if top_mot else None

# Replace only the NULL values.
current_mot = F.col("motorizacao")
fallback_mot = F.coalesce(F.col("mode_motorizacao"), F.lit(global_mode_mot))
df_joined = df.join(mode_motorizacao, on=keys_mot, how="left")
df_filled = df_joined.withColumn(
    "motorizacao",
    F.when(current_mot.isNull(), fallback_mot).otherwise(current_mot),
).drop("mode_motorizacao")

# Persist the filled result.
result_writer = df_filled.write.mode("overwrite").option("overwriteSchema", "true")
result_writer.saveAsTable("sc_gold.viaturas_2")


In [0]:
# Fill missing 'gwms_engine' with the most frequent value (mode) of its
# (cilindrada__cm3_, potencia_maxima__kw_, modelo, motorizacao) group, falling back to the
# table-wide mode.
df = spark.table("sc_gold.viaturas_2")

# Normalise both numeric key columns to double
# (decimal comma -> dot, strip every character that is not a digit or a dot).
for numeric_col in ("cilindrada__cm3_", "potencia_maxima__kw_"):
    numeric_text = F.regexp_replace(F.col(numeric_col), ",", ".")
    numeric_text = F.regexp_replace(numeric_text, r"[^0-9.]", "")
    df = df.withColumn(numeric_col, numeric_text.cast(DoubleType()))

# Text clean-up: trim everything; blank 'gwms_engine' / 'motorizacao' become NULL.
engine_trimmed = F.trim(F.col("gwms_engine"))
df = df.withColumn("gwms_engine", F.when(engine_trimmed == "", None).otherwise(engine_trimmed))
df = df.withColumn("modelo", F.trim(F.col("modelo")))
# "motorizacao" is written without an accent here - confirm the exact column name in the table.
mot_trimmed = F.trim(F.col("motorizacao"))
df = df.withColumn("motorizacao", F.when(mot_trimmed == "", None).otherwise(mot_trimmed))

# Grouping keys.
keys = ["cilindrada__cm3_", "potencia_maxima__kw_", "modelo", "motorizacao"]

# Rows that actually carry a 'gwms_engine' value; only these are counted.
known_engine = df.filter(F.col("gwms_engine").isNotNull())

# Frequency of each value inside its group; row number 1 is the highest count,
# with ties broken alphabetically.
counts = known_engine.groupBy(*keys, "gwms_engine").agg(F.count(F.lit(1)).alias("cnt"))
rank_in_group = F.row_number().over(
    W.partitionBy(*keys).orderBy(F.col("cnt").desc(), F.col("gwms_engine").asc())
)
mode_by_group = (
    counts.withColumn("rn", rank_in_group)
          .where(F.col("rn") == 1)
          .select(*keys, F.col("gwms_engine").alias("mode_gwms_engine"))
)

# Table-wide mode, used when a row's group offers no value.
row_global = (
    known_engine.groupBy("gwms_engine").agg(F.count(F.lit(1)).alias("cnt"))
                .orderBy(F.col("cnt").desc(), F.col("gwms_engine").asc())
                .limit(1).first()
)
global_mode = row_global["gwms_engine"] if row_global else None

# Replace only the NULL values.
current_engine = F.col("gwms_engine")
fallback_engine = F.coalesce(F.col("mode_gwms_engine"), F.lit(global_mode))
df_joined = df.join(mode_by_group, on=keys, how="left")
df_filled = df_joined.withColumn(
    "gwms_engine",
    F.when(current_engine.isNull(), fallback_engine).otherwise(current_engine),
).drop("mode_gwms_engine")

# Persist the filled result.
result_writer = df_filled.write.mode("overwrite").option("overwriteSchema", "true")
result_writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Fill missing 'combustivel' with the most frequent value (mode) of its
# (gwms_engine, cilindrada__cm3_, modelo, motorizacao) group, falling back to the
# table-wide mode.
df = spark.table("sc_gold.viaturas_2")

# Normalise the numeric key column to double
# (decimal comma -> dot, strip every character that is not a digit or a dot).
displacement_text = F.regexp_replace(F.col("cilindrada__cm3_"), ",", ".")
displacement_text = F.regexp_replace(displacement_text, r"[^0-9.]", "")
df = df.withColumn("cilindrada__cm3_", displacement_text.cast(DoubleType()))

# Text clean-up: 'modelo' is only trimmed; the other three are trimmed and become NULL
# when nothing is left after trimming.
df = df.withColumn("modelo", F.trim(F.col("modelo")).cast("string"))
for blankable in ("gwms_engine", "motorizacao", "combustivel"):
    df = df.withColumn(
        blankable,
        F.when(F.length(F.trim(blankable)) == 0, None)
         .otherwise(F.trim(F.col(blankable)).cast("string")),
    )

# Grouping keys.
keys = ["gwms_engine", "cilindrada__cm3_", "modelo", "motorizacao"]

# Rows that actually carry a 'combustivel' value; only these are counted.
known_fuel = df.filter(F.col("combustivel").isNotNull())

# Frequency of each value inside its group; row number 1 is the highest count,
# with ties broken alphabetically.
counts = known_fuel.groupBy(*keys, "combustivel").agg(F.count(F.lit(1)).alias("cnt"))
rank_in_group = F.row_number().over(
    W.partitionBy(*keys).orderBy(F.col("cnt").desc(), F.col("combustivel").asc())
)
mode_by_group = (
    counts.withColumn("rn", rank_in_group)
          .where(F.col("rn") == 1)
          .select(*keys, F.col("combustivel").alias("mode_combustivel"))
)

# Table-wide mode, used when a row's group offers no value.
row_global = (
    known_fuel.groupBy("combustivel").agg(F.count(F.lit(1)).alias("cnt"))
              .orderBy(F.col("cnt").desc(), F.col("combustivel").asc())
              .limit(1).first()
)
global_mode = row_global["combustivel"] if row_global else None

# Replace only the NULL values.
current_fuel = F.col("combustivel")
fallback_fuel = F.coalesce(F.col("mode_combustivel"), F.lit(global_mode))
df_joined = df.join(mode_by_group, on=keys, how="left")
df_filled = df_joined.withColumn(
    "combustivel",
    F.when(current_fuel.isNull(), fallback_fuel).otherwise(current_fuel),
).drop("mode_combustivel")

# Persist the filled result.
result_writer = df_filled.write.mode("overwrite").option("overwriteSchema", "true")
result_writer.saveAsTable("sc_gold.viaturas_2")


In [0]:
# Fill missing 'designacao_comercial' with the most frequent value (mode) of its
# (gwms_engine, modelo, motorizacao) group, falling back to the table-wide mode.
df = spark.table("sc_gold.viaturas_2")

# Text clean-up: trim each column and turn blank strings into NULL.
for text_col in ("designacao_comercial", "modelo", "gwms_engine", "motorizacao"):
    trimmed = F.trim(F.col(text_col))
    df = df.withColumn(text_col, F.when(trimmed == "", None).otherwise(trimmed))

# Grouping keys.
keys = ["gwms_engine", "modelo", "motorizacao"]

# Rows that actually carry a 'designacao_comercial' value; only these are counted.
known_name = df.filter(F.col("designacao_comercial").isNotNull())

# Frequency of each value inside its group; row number 1 is the highest count,
# with ties broken alphabetically.
counts = known_name.groupBy(*keys, "designacao_comercial").agg(F.count(F.lit(1)).alias("cnt"))
rank_in_group = F.row_number().over(
    W.partitionBy(*keys).orderBy(F.col("cnt").desc(), F.col("designacao_comercial").asc())
)
mode_by_group = (
    counts.withColumn("rn", rank_in_group)
          .where(F.col("rn") == 1)
          .select(*keys, F.col("designacao_comercial").alias("mode_designacao_comercial"))
)

# Table-wide mode, used when a row's group offers no value.
row_global = (
    known_name.groupBy("designacao_comercial").agg(F.count(F.lit(1)).alias("cnt"))
              .orderBy(F.col("cnt").desc(), F.col("designacao_comercial").asc())
              .limit(1).first()
)
global_mode = row_global["designacao_comercial"] if row_global else None

# Replace only the NULL values.
current_name = F.col("designacao_comercial")
fallback_name = F.coalesce(F.col("mode_designacao_comercial"), F.lit(global_mode))
df_joined = df.join(mode_by_group, on=keys, how="left")
df_filled = df_joined.withColumn(
    "designacao_comercial",
    F.when(current_name.isNull(), fallback_name).otherwise(current_name),
).drop("mode_designacao_comercial")

# Persist the filled result.
result_writer = df_filled.write.mode("overwrite").option("overwriteSchema", "true")
result_writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# Fill missing 'versao' with the most frequent value (mode) of its
# (modelo, gwms_engine, motorizacao, age_year) group, falling back to the table-wide mode.
# 1) Read the table
df = spark.table("sc_gold.viaturas_2")

# 2) Clean the text columns only (trim; blank strings -> NULL).
#    'age_year' is a grouping key but is numeric (the step that creates it builds it with
#    F.floor). Passing it through F.trim would cast it to string, and the overwrite below
#    would then persist it as a string column, which breaks its later use as a numeric
#    feature. It is therefore left untouched.
for text_col in ["versao", "modelo", "gwms_engine", "motorizacao"]:
    trimmed = F.trim(F.col(text_col))
    df = df.withColumn(text_col, F.when(trimmed == "", None).otherwise(trimmed))

# 3) Grouping keys
keys = ["modelo", "gwms_engine", "motorizacao", "age_year"]

# 4) Frequency of each 'versao' inside its group; only non-null values are counted
counts = (
    df.filter(F.col("versao").isNotNull())
      .groupBy(*keys, "versao")
      .agg(F.count(F.lit(1)).alias("cnt"))
)

# Row number 1 is the highest count, with ties broken alphabetically.
w = W.partitionBy(*keys).orderBy(F.col("cnt").desc(), F.col("versao").asc())

mode_by_group = (
    counts.withColumn("rn", F.row_number().over(w))
          .filter(F.col("rn") == 1)
          .select(*keys, F.col("versao").alias("mode_versao"))
)

# 5) Table-wide mode, used when a row's group offers no value
row_global = (
    df.filter(F.col("versao").isNotNull())
      .groupBy("versao").agg(F.count(F.lit(1)).alias("cnt"))
      .orderBy(F.col("cnt").desc(), F.col("versao").asc())
      .limit(1).first()
)
global_mode = row_global["versao"] if row_global else None

# 6) Replace only the NULL values of 'versao'
df_filled = (
    df.join(mode_by_group, on=keys, how="left")
      .withColumn(
          "versao",
          F.when(
              F.col("versao").isNull(),
              F.coalesce(F.col("mode_versao"), F.lit(global_mode))
          ).otherwise(F.col("versao"))
      )
      .drop("mode_versao")
)

# 7) Persist the filled result
(df_filled.write
   .mode("overwrite")
   .option("overwriteSchema", "true")
   .saveAsTable("sc_gold.viaturas_2"))


In [0]:
# Remove rows that have no production date and drop the helper column 'production_date_dt'.
# Read the table
df = spark.table("sc_gold.viaturas_2")

# Filter first, then drop the helper column from the FILTERED frame. Dropping from `df`
# instead would silently discard the filter and write every row back unchanged.
df_clean = (
    df.filter(F.col("production_date").isNotNull())
      .drop("production_date_dt")
)

# Overwrite the table with the cleaned dataset
(df_clean.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("sc_gold.viaturas_2"))

In [0]:
from pyspark.sql.functions import corr

# Correlation between production year and age, to check whether the two columns are redundant.
# Uses the `df` left in scope by the preceding cell.
year_age_corr = df.select(corr("production_year", "age_year").alias("corr"))
year_age_corr.show()

In [0]:
# Drop "production_year": per the author's note it would distort the clustering by counting
# the same factor as "age_year" twice.
# Read the table
df = spark.table("sc_gold.viaturas_2")

# Remove the 'production_year' column
df_clean = df.drop("production_year")

# Overwrite the table with the reduced dataset
clean_writer = df_clean.write.mode("overwrite").option("overwriteSchema", "true")
clean_writer.saveAsTable("sc_gold.viaturas_2")

In [0]:
# BEFORE RUNNING: decide whether it makes sense to create a registration-year column and
# remove the two date columns for K-means.
# NOTE(review): this works on the `df` already in scope and does not write to the table.
from pyspark.sql import functions as F

# Calendar year of each date.
df = df.withColumn("production_year", F.year("production_date"))
df = df.withColumn("matricula_year", F.year("data_de_matricula"))

# Months between production and registration, divided by 12.
df = df.withColumn(
    "delay_matricula",
    F.months_between("data_de_matricula", "production_date") / 12.0,
)

In [0]:
from pyspark.sql import functions as F

table_name = "sc_gold.viaturas_2"

# Reload the table and take its row count, which is the denominator for every column.
df = spark.table(table_name)
total_rows = df.count()


def _null_pct(column_name):
    """Return the aggregate (NULL rows / total rows) * 100, aliased to the column's name."""
    null_hits = F.count(F.when(F.col(column_name).isNull(), column_name))
    return (null_hits / total_rows * 100).alias(column_name)


# Percentage of NULLs for every column of the table.
null_percentages = df.select(list(map(_null_pct, df.columns)))

display(null_percentages)

Preparação para K-MEANS

In [0]:
# Reload the cleaned table and inspect it before building the K-means features.
table_name = "sc_gold.viaturas_2"
df = spark.table(table_name)

# Column names and types, followed by the first five rows.
df.printSchema()
df.show(n=5)


In [0]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

categorical_cols = ["motorizacao", "combustivel", "gwms_engine", "designacao_comercial", "versao","modelo"]

# For every categorical column build one indexer (<col> -> <col>_idx) and one one-hot
# encoder (<col>_idx -> <col>_vec). The indexers are configured with handleInvalid="keep".
indexers = []
encoders = []
for cat_col in categorical_cols:
    idx_col = cat_col + "_idx"
    indexers.append(StringIndexer(inputCol=cat_col, outputCol=idx_col, handleInvalid="keep"))
    encoders.append(OneHotEncoder(inputCol=idx_col, outputCol=cat_col + "_vec"))


In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Numeric features for clustering. "production_year" is deliberately absent: an earlier step
# drops it from sc_gold.viaturas_2 (it duplicates the information in "age_year"), so the
# reloaded table no longer has the column and listing it here would make the assembler fail.
numeric_cols = ["cilindrada__cm3_", "potencia_maxima__kw_", "age_year"]

# Pack the numeric columns into one vector, then standardise it (centre on the mean and
# scale to unit standard deviation).
assembler_num = VectorAssembler(inputCols=numeric_cols, outputCol="numeric_features")
scaler = StandardScaler(inputCol="numeric_features", outputCol="scaled_numeric", withMean=True, withStd=True)
