In [0]:
%sql
-- Preview every column and row of the service-history table.
SELECT
  *
FROM
  sc_gold.historico_de_servicos

In [0]:
%sql
-- Preview every column and row of the vehicles table.
SELECT
  *
FROM
  sc_gold.viaturas

In [0]:
%sql
-- Rebuild sc_gold.historico_de_servicos_2 from scratch as a copy of
-- sc_gold.historico_de_servicos restricted to the columns listed below.
DROP TABLE IF EXISTS sc_gold.historico_de_servicos_2;

CREATE TABLE sc_gold.historico_de_servicos_2 AS
SELECT
  numero_do_servico_pos_venda,
  data_de_fecho,
  data_de_abertura,
  canal_de_venda,
  data_servico_pos_venda,
  kms,
  total_mao_de_obra,
  descricao_servico_pos_venda,
  nome_concessao,
  viatura,
  cliente,
  ordem_reparacao,
  tipo_de_servico,
  origem_registo,
  id,
  pedido_do_cliente
FROM
  sc_gold.historico_de_servicos;


In [0]:
%sql
-- Preview the reduced service-history copy.
SELECT
  *
FROM
  sc_gold.historico_de_servicos_2

In [0]:
%sql
-- Rebuild sc_gold.viaturas_2 from scratch as a copy of sc_gold.viaturas
-- restricted to the columns listed below.
DROP TABLE IF EXISTS sc_gold.viaturas_2;

CREATE TABLE sc_gold.viaturas_2 AS
SELECT
  id,
  designacao_comercial,
  modelo,
  motorizacao,
  versao,
  data_de_matricula,
  matricula,
  kms,
  cilindrada__cm3_,
  potencia_maxima__kw_,
  combustivel,
  gwms_engine,
  gwms_model_year,
  production_date
FROM
  sc_gold.viaturas;

In [0]:
%sql
-- Preview the reduced vehicles copy.
SELECT
  *
FROM
  sc_gold.viaturas_2

In [0]:
from pyspark.sql import functions as F

table_name = "sc_gold.viaturas_2"

# Read the table whose per-column completeness is being inspected.
df = spark.table(table_name)

# The row count is the denominator of every percentage below.
total_rows = df.count()

# Build one aggregate expression per column: the number of NULL values in
# that column, divided by the row count and scaled to a percentage. Each
# result keeps the name of the column it describes.
null_pct_exprs = []
for column_name in df.columns:
    null_count = F.count(F.when(F.col(column_name).isNull(), column_name))
    null_pct_exprs.append((null_count / total_rows * 100).alias(column_name))

# Single-row frame holding the NULL percentage of every column.
null_percentages = df.select(null_pct_exprs)

display(null_percentages)


In [0]:
# Step used to drop the vehicles whose "modelo" value is missing.
from pyspark.sql import functions as F

table_name = "sc_gold.viaturas_2"
df0 = spark.table(table_name)

# A "modelo" counts as present only when it is not NULL, not blank after
# trimming, and not the text "null" in any letter case.
# (The original author noted that this filter removed 486 rows.)
modelo_trimmed = F.trim(F.col("modelo"))
has_modelo = (
    F.col("modelo").isNotNull()
    & (modelo_trimmed != "")
    & (F.lower(modelo_trimmed) != "null")
)
df = df0.filter(has_modelo)

# Write the filtered rows back over the same table that was read above,
# with the overwriteSchema option enabled.
(
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(table_name)
)


In [0]:
# Step used to fill missing engine displacement (cilindrada__cm3_) with the
# rounded mean displacement of the vehicles sharing the same gwms_engine,
# falling back to the rounded global mean when the engine group has none.
from pyspark.sql import functions as F

table_name = "sc_gold.viaturas_2"

# 1) Read the table
df0 = spark.table(table_name)

# 2) Normalise / clean the displacement column.
#    Every character other than a digit or a dot is removed before parsing, so
#    a decimal comma is dropped as well ("1598,0" is parsed as 15980).
#    NOTE(review): confirm the source never uses a decimal comma.
cil_clean_str = F.regexp_replace(F.col("cilindrada__cm3_"), r"[^0-9.]", "")
# Raw values that are NULL, blank or a lone "." are treated as missing.
cil_null_cond = (
    (F.col("cilindrada__cm3_").isNull()) |
    (F.trim(F.col("cilindrada__cm3_")) == "") |
    (F.trim(F.col("cilindrada__cm3_")) == ".")
)

df = df0.withColumn(
    "cilindrada_num",
    F.when(cil_null_cond, None).otherwise(cil_clean_str.cast("double"))
)

# 3) Mean per gwms_engine (rounded to an integer)
by_engine_mean = (
    df.groupBy("gwms_engine")
      .agg(F.round(F.avg("cilindrada_num"), 0).cast("int").alias("avg_cil_gwms"))
)

# 4) Global mean (fallback, also rounded).
#    avg() yields NULL when no row has a usable displacement; keep None in that
#    case instead of failing on int(None).
global_mean_value = df.agg(F.round(F.avg("cilindrada_num"), 0)).first()[0]
global_mean = int(global_mean_value) if global_mean_value is not None else None

# 5) Attach the per-engine mean and fill the gaps.
#    Rows with a NULL gwms_engine never match in the join, so they fall through
#    to the global mean.
df_joined = df.join(by_engine_mean, on="gwms_engine", how="left")

# Priority: the vehicle's own value (truncated to int), then the engine mean,
# then the global mean. The literal is cast so the column stays an integer even
# when global_mean is None.
df_filled = df_joined.withColumn(
    "cilindrada__cm3__filled",
    F.coalesce(
        F.col("cilindrada_num").cast("int"),
        F.col("avg_cil_gwms"),
        F.lit(global_mean).cast("int"),
    )
)

# 6) Replace the original column with the filled values and drop the helpers
result = (
    df_filled
    .drop("cilindrada__cm3_")
    .withColumnRenamed("cilindrada__cm3__filled", "cilindrada__cm3_")
    .drop("cilindrada_num", "avg_cil_gwms")
)

# 7) Save the result as a new "cleaned" table
result.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("sc_gold.viaturas_2_cleaned")

In [0]:
from pyspark.sql import functions as F, Window

# 1) Read the cleaned table
df = spark.table("sc_gold.viaturas_2_cleaned")

# 2) Count how often each motorizacao occurs within each gwms_engine.
#    Rows whose motorizacao is NULL form their own group and are counted too.
counts = (
    df.groupBy("gwms_engine", "motorizacao")
      .agg(F.count("*").alias("cnt"))
)

# 3) Window over each gwms_engine, most frequent motorizacao first.
#    Ties on cnt are broken by motorizacao in ascending order with NULL last,
#    so row_number() selects the same row on every run instead of an
#    arbitrary one.
w = (
    Window.partitionBy("gwms_engine")
          .orderBy(F.desc("cnt"), F.asc_nulls_last("motorizacao"))
)

# 4) Add the rank within each engine
counts_ranked = counts.withColumn("rn", F.row_number().over(w))

# 5) Keep only the top row per engine = the mode
moda_por_engine = (
    counts_ranked.filter(F.col("rn") == 1)
                 .select("gwms_engine", F.col("motorizacao").alias("moda_motorizacao"))
)

# 6) (Optional) join the mode back onto the original dataset
df_with_moda = df.join(moda_por_engine, on="gwms_engine", how="left")

df_with_moda.select("matricula","gwms_engine", "motorizacao", "moda_motorizacao").show(50, truncate=False)

In [0]:
from pyspark.sql import functions as F

table_name = "sc_gold.viaturas_2_cleaned"

# Read the cleaned table to re-check per-column completeness.
df = spark.table(table_name)

# The row count is the denominator of every percentage below.
total_rows = df.count()

# One aggregate expression per column: NULL count divided by the row count,
# scaled to a percentage and named after the column it describes.
null_pct_exprs = []
for column_name in df.columns:
    null_count = F.count(F.when(F.col(column_name).isNull(), column_name))
    null_pct_exprs.append((null_count / total_rows * 100).alias(column_name))

# Single-row frame holding the NULL percentage of every column.
null_percentages = df.select(null_pct_exprs)

display(null_percentages)