In [0]:
%sql
-- Preview every column and row of the deals table.
SELECT deals.*
FROM workspace.sc_silver.deals AS deals;

In [0]:
%sql
-- Preview every column and row of the leads_pbs table.
SELECT leads.*
FROM workspace.sc_silver.leads_pbs AS leads;

In [0]:
%sql
-- Preview every column and row of the campaigns table.
SELECT campaigns.*
FROM workspace.sc_silver.campaigns AS campaigns;

In [0]:
%sql
-- Preview every column and row of the contactos_pbs table.
SELECT contacts.*
FROM workspace.sc_silver.contactos_pbs AS contacts;

In [0]:
%sql
-- Rebuild campaigns in place from its own current contents, keeping only the
-- eleven columns listed below; any other column on the table is discarded.
CREATE OR REPLACE TABLE workspace.sc_silver.campaigns AS
SELECT
  src._fivetran_synced,
  src._fivetran_deleted,
  src._fivetran_index,
  src.layout,
  src.codigo_campanha,
  src.modified_time,
  src.created_time,
  src.campaign_owner_name,
  src.campaign_name,
  src.id,
  src._fivetran_id
FROM workspace.sc_silver.campaigns AS src;


In [0]:
%sql
-- Number of distinct id values that occur on more than one contactos_pbs row.
WITH repeated_ids AS (
  SELECT contacts.id
  FROM workspace.sc_silver.contactos_pbs AS contacts
  GROUP BY contacts.id
  HAVING COUNT(*) > 1
)
SELECT COUNT(*) AS num_duplicate_ids
FROM repeated_ids;


In [0]:
%sql
-- Number of non-NULL converted_contact values shared by more than one lead.
WITH repeated_contacts AS (
  SELECT leads.converted_contact
  FROM workspace.sc_silver.leads_pbs AS leads
  WHERE leads.converted_contact IS NOT NULL
  GROUP BY leads.converted_contact
  HAVING COUNT(*) > 1
)
SELECT COUNT(*) AS num_duplicate_converted_contacts
FROM repeated_contacts;


In [0]:
%sql
-- Row count of leads_pbs.
SELECT COUNT(1) AS total_rows
FROM workspace.sc_silver.leads_pbs AS leads;


In [0]:
%sql
-- Number of contactos_pbs rows whose id is not NULL (COUNT(col) skips NULLs).
SELECT COUNT(contacts.id) AS non_null_ids
FROM workspace.sc_silver.contactos_pbs AS contacts;


In [0]:
%sql
-- Distinct contact ids that are referenced by at least one lead through
-- leads_pbs.converted_contact.
SELECT COUNT(DISTINCT contacts.id) AS common_unique_ids
FROM workspace.sc_silver.contactos_pbs AS contacts
INNER JOIN workspace.sc_silver.leads_pbs AS leads
  ON contacts.id = leads.converted_contact;


In [0]:
%sql
-- Distinct leads_pbs.converted_contact values that also appear as campaigns.id.
-- No explicit NULL filter is needed: an equality join never matches NULL keys.
SELECT COUNT(DISTINCT leads.converted_contact) AS common_ids_between_leads_and_campaigns
FROM workspace.sc_silver.leads_pbs AS leads
INNER JOIN workspace.sc_silver.campaigns AS camp
  ON leads.converted_contact = camp.id;


In [0]:
%sql
-- List the campaign ids that also occur as an id in leads_pbs.
SELECT DISTINCT camp.id AS campaign_id_in_leads_id
FROM workspace.sc_silver.campaigns AS camp
WHERE EXISTS (
  SELECT 1
  FROM workspace.sc_silver.leads_pbs AS leads
  WHERE leads.id = camp.id
);


In [0]:
%sql
-- For every campaigns row, flag whether campaigns.id also occurs as
-- leads_pbs.id and/or as leads_pbs.lead_owner.
-- Each lookup joins a de-duplicated key list, so a campaigns row is emitted
-- exactly once. Joining the raw leads table twice would repeat it once per
-- (match by id) x (match by owner) combination.
WITH lead_ids AS (
  SELECT DISTINCT id
  FROM workspace.sc_silver.leads_pbs
  WHERE id IS NOT NULL
),
lead_owners AS (
  SELECT DISTINCT lead_owner
  FROM workspace.sc_silver.leads_pbs
  WHERE lead_owner IS NOT NULL
)
SELECT
  campaigns.id AS campaign_id,
  CASE WHEN lead_ids.id IS NOT NULL THEN 'Yes' ELSE 'No' END AS found_in_leads_id,
  CASE WHEN lead_owners.lead_owner IS NOT NULL THEN 'Yes' ELSE 'No' END AS found_in_lead_owner
FROM workspace.sc_silver.campaigns

LEFT JOIN lead_ids
  ON campaigns.id = lead_ids.id

LEFT JOIN lead_owners
  ON campaigns.id = lead_owners.lead_owner;


In [0]:
%sql
-- For every campaigns row, flag whether campaigns.id occurs as leads_pbs.id
-- and/or leads_pbs.lead_owner, and show the matching lead's id and
-- converted_contact.
-- The owner lookup only feeds a Yes/No flag, so it joins a de-duplicated key
-- list; joining the raw table there would repeat each output row once per
-- lead owned by that id.
WITH lead_owners AS (
  SELECT DISTINCT lead_owner
  FROM workspace.sc_silver.leads_pbs
  WHERE lead_owner IS NOT NULL
)
SELECT
  campaigns.id AS campaign_id,

  -- Lookup flags
  CASE WHEN leads_by_id.id IS NOT NULL THEN 'Yes' ELSE 'No' END AS found_in_leads_id,
  CASE WHEN lead_owners.lead_owner IS NOT NULL THEN 'Yes' ELSE 'No' END AS found_in_lead_owner,

  -- Columns taken from the lead matched by id (NULL when there is no match)
  leads_by_id.id AS id_lead,
  leads_by_id.converted_contact

FROM workspace.sc_silver.campaigns

-- Lookup campaigns.id in leads_pbs.id (kept as a row-level join because its
-- columns are selected above)
LEFT JOIN workspace.sc_silver.leads_pbs AS leads_by_id
  ON campaigns.id = leads_by_id.id

-- Lookup campaigns.id in the distinct leads_pbs.lead_owner values
LEFT JOIN lead_owners
  ON campaigns.id = lead_owners.lead_owner;


In [0]:
%sql
-- For every contactos_pbs row, flag whether its id occurs in
-- propostas_realizadas as id, as proposta_realizada_owner, or as id_contacto.
-- Each lookup joins a de-duplicated key list so a contact row is emitted
-- exactly once; three joins against the raw proposals table would repeat it
-- once per combination of matching proposals.
WITH proposal_ids AS (
  SELECT DISTINCT id
  FROM workspace.sc_silver.propostas_realizadas
  WHERE id IS NOT NULL
),
proposal_owners AS (
  SELECT DISTINCT proposta_realizada_owner
  FROM workspace.sc_silver.propostas_realizadas
  WHERE proposta_realizada_owner IS NOT NULL
),
proposal_contact_ids AS (
  SELECT DISTINCT id_contacto
  FROM workspace.sc_silver.propostas_realizadas
  WHERE id_contacto IS NOT NULL
)
SELECT
  c.id AS contacto_id,

  -- Match flags
  CASE WHEN p1.id IS NOT NULL THEN 'Yes' ELSE 'No' END AS match_with_proposta_id,
  CASE WHEN p2.proposta_realizada_owner IS NOT NULL THEN 'Yes' ELSE 'No' END AS match_with_proposta_owner,
  CASE WHEN p3.id_contacto IS NOT NULL THEN 'Yes' ELSE 'No' END AS match_with_id_contacto

FROM workspace.sc_silver.contactos_pbs c

-- Lookup against propostas_realizadas.id
LEFT JOIN proposal_ids p1
  ON c.id = p1.id

-- Lookup against propostas_realizadas.proposta_realizada_owner
LEFT JOIN proposal_owners p2
  ON c.id = p2.proposta_realizada_owner

-- Lookup against propostas_realizadas.id_contacto
LEFT JOIN proposal_contact_ids p3
  ON c.id = p3.id_contacto;


In [0]:
%sql
-- Distinct propostas_realizadas.id_contacto values that exist as an id in
-- contactos_pbs. The equality join already discards NULL id_contacto values.
SELECT COUNT(DISTINCT proposals.id_contacto) AS matching_contact_ids
FROM workspace.sc_silver.propostas_realizadas AS proposals
INNER JOIN workspace.sc_silver.contactos_pbs AS contacts
  ON proposals.id_contacto = contacts.id;


In [0]:
%sql
-- Distinct id_contacto values present in both propostas_realizadas and deals.
-- The equality join already discards NULL id_contacto values.
SELECT COUNT(DISTINCT proposals.id_contacto) AS matching_contact_ids
FROM workspace.sc_silver.propostas_realizadas AS proposals
INNER JOIN workspace.sc_silver.deals AS deals
  ON proposals.id_contacto = deals.id_contacto;


In [0]:
%sql
-- Destructive: removes from the silver table every lead that has no
-- converted_contact. All later cells that read leads_pbs see only the
-- remaining rows.
DELETE FROM workspace.sc_silver.leads_pbs
WHERE converted_contact IS NULL;


In [0]:
%sql
-- Row count of leads_pbs (compare with the count taken before the DELETE).
SELECT COUNT(1) AS total_rows
FROM workspace.sc_silver.leads_pbs AS leads;


In [0]:
%sql
-- For every lead with a converted_contact, flag whether that value occurs in
-- deals as id and/or as converted_from_lead.
-- Each lookup joins a de-duplicated key list so a lead row is emitted exactly
-- once; two joins against the raw deals table would repeat it once per
-- combination of matching deals.
WITH deal_ids AS (
  SELECT DISTINCT id
  FROM workspace.sc_silver.deals
  WHERE id IS NOT NULL
),
deal_source_leads AS (
  SELECT DISTINCT converted_from_lead
  FROM workspace.sc_silver.deals
  WHERE converted_from_lead IS NOT NULL
)
SELECT
  l.converted_contact,
  CASE WHEN d1.id IS NOT NULL THEN 'Yes' ELSE 'No' END AS match_in_deals_id,
  CASE WHEN d2.converted_from_lead IS NOT NULL THEN 'Yes' ELSE 'No' END AS match_in_deals_converted_from_lead
FROM workspace.sc_silver.leads_pbs l
LEFT JOIN deal_ids d1
  ON l.converted_contact = d1.id
LEFT JOIN deal_source_leads d2
  ON l.converted_contact = d2.converted_from_lead
WHERE l.converted_contact IS NOT NULL;


In [0]:
%sql
-- Both overlap counts are returned as one row so that neither result is lost
-- when the cell runs (two separate SELECTs in one cell surface only one
-- result grid).
WITH by_deal_id AS (
  -- How many leads match deals.id
  SELECT COUNT(DISTINCT l.id) AS leads_in_deals_id
  FROM workspace.sc_silver.leads_pbs l
  INNER JOIN workspace.sc_silver.deals d ON l.id = d.id
),
by_converted_from_lead AS (
  -- How many leads match deals.converted_from_lead
  SELECT COUNT(DISTINCT l.id) AS leads_in_converted_from_lead
  FROM workspace.sc_silver.leads_pbs l
  INNER JOIN workspace.sc_silver.deals d ON l.id = d.converted_from_lead
)
SELECT
  by_deal_id.leads_in_deals_id,
  by_converted_from_lead.leads_in_converted_from_lead
FROM by_deal_id
CROSS JOIN by_converted_from_lead;


In [0]:
%sql
-- Distinct deals.campanha values that exist as an id in campaigns.
-- The equality join already discards NULL campanha values.
SELECT COUNT(DISTINCT deals.campanha) AS matched_campaigns_in_deals
FROM workspace.sc_silver.deals AS deals
INNER JOIN workspace.sc_silver.campaigns AS camp
  ON deals.campanha = camp.id;


In [0]:
%sql
-- Number of non-NULL converted_contact values shared by more than one lead.
WITH repeated_contacts AS (
  SELECT leads.converted_contact
  FROM workspace.sc_silver.leads_pbs AS leads
  WHERE leads.converted_contact IS NOT NULL
  GROUP BY leads.converted_contact
  HAVING COUNT(*) > 1
)
SELECT COUNT(*) AS num_duplicate_converted_contacts
FROM repeated_contacts;


In [0]:
%sql
-- Histogram: for each "number of leads per converted_contact", how many
-- contacts have that many leads.
WITH per_contact AS (
  SELECT
    leads.converted_contact,
    COUNT(*) AS occurrences
  FROM workspace.sc_silver.leads_pbs AS leads
  WHERE leads.converted_contact IS NOT NULL
  GROUP BY leads.converted_contact
)
SELECT
  per_contact.occurrences,
  COUNT(*) AS num_contacts_with_this_occurrence
FROM per_contact
GROUP BY per_contact.occurrences
ORDER BY per_contact.occurrences;


In [0]:
%sql
-- Number of distinct id values that occur on more than one campaigns row.
WITH repeated_ids AS (
  SELECT camp.id
  FROM workspace.sc_silver.campaigns AS camp
  GROUP BY camp.id
  HAVING COUNT(*) > 1
)
SELECT COUNT(*) AS num_duplicate_ids
FROM repeated_ids;


In [0]:
%sql
-- Contacts that appear on more than one deal, most frequent first.
WITH per_contact AS (
  SELECT
    deals.id_contacto,
    COUNT(*) AS occurrences
  FROM workspace.sc_silver.deals AS deals
  WHERE deals.id_contacto IS NOT NULL
  GROUP BY deals.id_contacto
)
SELECT
  per_contact.id_contacto,
  per_contact.occurrences
FROM per_contact
WHERE per_contact.occurrences > 1
ORDER BY per_contact.occurrences DESC;


In [0]:
%sql
-- Percentage of NULL values per column of propostas_realizadas.
-- COUNT(col) skips NULLs, so COUNT(*) - COUNT(col) is the number of NULL rows.
SELECT
  COUNT(*) AS total_rows,
  (COUNT(*) - COUNT(apoio_concessionario)) * 100.0 / COUNT(*) AS pct_null_apoio_concessionario,
  (COUNT(*) - COUNT(apoio_total)) * 100.0 / COUNT(*) AS pct_null_apoio_total,
  (COUNT(*) - COUNT(sub_total_com_extras)) * 100.0 / COUNT(*) AS pct_null_sub_total_com_extras,
  (COUNT(*) - COUNT(descricao_do_pedido_de_apoio)) * 100.0 / COUNT(*) AS pct_null_descricao_do_pedido_de_apoio,
  (COUNT(*) - COUNT(estado_do_pedido)) * 100.0 / COUNT(*) AS pct_null_estado_do_pedido,
  (COUNT(*) - COUNT(apoio_percentual)) * 100.0 / COUNT(*) AS pct_null_apoio_percentual,
  (COUNT(*) - COUNT(data_de_criacao_da_proposta)) * 100.0 / COUNT(*) AS pct_null_data_de_criacao_da_proposta,
  (COUNT(*) - COUNT(valor_campanhas_comerciais)) * 100.0 / COUNT(*) AS pct_null_valor_campanhas_comerciais,
  (COUNT(*) - COUNT(ofertas_de_campanha)) * 100.0 / COUNT(*) AS pct_null_ofertas_de_campanha,
  (COUNT(*) - COUNT(resposta_do_importador)) * 100.0 / COUNT(*) AS pct_null_resposta_do_importador,
  (COUNT(*) - COUNT(data_de_entrega_da_proposta)) * 100.0 / COUNT(*) AS pct_null_data_de_entrega_da_proposta,
  (COUNT(*) - COUNT(desconto_total__c__apoio_de_importador_)) * 100.0 / COUNT(*) AS pct_null_desconto_total__c__apoio_de_importador_,
  (COUNT(*) - COUNT(codigo_cor_exterior)) * 100.0 / COUNT(*) AS pct_null_codigo_cor_exterior,
  (COUNT(*) - COUNT(codigo_cor_interior)) * 100.0 / COUNT(*) AS pct_null_codigo_cor_interior,
  (COUNT(*) - COUNT(concessionario_owner)) * 100.0 / COUNT(*) AS pct_null_concessionario_owner,
  (COUNT(*) - COUNT(valor_aprovado)) * 100.0 / COUNT(*) AS pct_null_valor_aprovado,
  (COUNT(*) - COUNT(estado_do_contrato)) * 100.0 / COUNT(*) AS pct_null_estado_do_contrato,
  (COUNT(*) - COUNT(data_prevista_matricula)) * 100.0 / COUNT(*) AS pct_null_data_prevista_matricula,
  (COUNT(*) - COUNT(data_prevista_de_entrega)) * 100.0 / COUNT(*) AS pct_null_data_prevista_de_entrega,
  (COUNT(*) - COUNT(data_da_conclusao)) * 100.0 / COUNT(*) AS pct_null_data_da_conclusao,
  (COUNT(*) - COUNT(id_classe)) * 100.0 / COUNT(*) AS pct_null_id_classe,
  (COUNT(*) - COUNT(descricao_classe)) * 100.0 / COUNT(*) AS pct_null_descricao_classe,
  (COUNT(*) - COUNT(id_model_group)) * 100.0 / COUNT(*) AS pct_null_id_model_group,
  (COUNT(*) - COUNT(descricao_model_group)) * 100.0 / COUNT(*) AS pct_null_descricao_model_group,
  (COUNT(*) - COUNT(valid_until)) * 100.0 / COUNT(*) AS pct_null_valid_until,
  (COUNT(*) - COUNT(forma_de_pagamento)) * 100.0 / COUNT(*) AS pct_null_forma_de_pagamento,
  (COUNT(*) - COUNT(nome_da_campanha)) * 100.0 / COUNT(*) AS pct_null_nome_da_campanha,
  (COUNT(*) - COUNT(_fivetran_index)) * 100.0 / COUNT(*) AS pct_null_fivetran_index
FROM workspace.sc_silver.propostas_realizadas;


In [0]:
%sql
-- Remove the listed columns from propostas_realizadas.
-- IF EXISTS keeps the cell re-runnable: columns already removed by an earlier
-- run are skipped instead of failing the statement.
ALTER TABLE workspace.sc_silver.propostas_realizadas
DROP COLUMNS IF EXISTS (
  descricao_do_pedido_de_apoio,
  apoio_percentual,
  valor_campanhas_comerciais,
  ofertas_de_campanha,
  resposta_do_importador,
  valid_until,
  nome_da_campanha
);


In [0]:
%sql
-- Remove the listed columns from contactos_pbs.
-- IF EXISTS keeps the cell re-runnable: columns already removed by an earlier
-- run are skipped instead of failing the statement.
ALTER TABLE workspace.sc_silver.contactos_pbs
DROP COLUMNS IF EXISTS (
  last_name,
  row_names,
  email,
  mobile,
  first_name,
  phone,
  date_of_birth,
  email_opt_out,
  salutation,
  tag,
  codigo_postal,
  login_myhyundai,
  full_name,
  pioneiros_myhyundai,
  lead_source,
  nif,
  modo_de_contacto_preferencial,
  distrito,
  pais,
  sexo,
  localidade,
  morada,
  concelho,
  consentimento,
  data_de_recolha_de_consentimento_hyundai,
  description,
  link_centro_consentimento,
  ccupdated
);


In [0]:
%sql
-- Remove the listed columns from leads_pbs.
-- IF EXISTS keeps the cell re-runnable: columns already removed by an earlier
-- run are skipped instead of failing the statement.
ALTER TABLE workspace.sc_silver.leads_pbs
DROP COLUMNS IF EXISTS (
  last_name,
  row_names,
  email,
  mobile,
  telefone_empresa,
  e_mail_empresa,
  first_name,
  zip_code,
  description,
  salutation,
  motivo,
  hmod___hcreat,
  campaign_name,
  campaign_medium,
  campaign_term,
  campaign_content,
  campaignsource,
  con,
  caracterizacao,
  test_check1,
  test_check,
  link_centro_consentimento,
  industry,
  login_myhyundai,
  website,
  full_name,
  pontuacao,
  data_de_recolha_de_consentimento_hyundai,
  ccupdated
);


In [0]:
%sql
-- Remove the listed columns from deals.
-- IF EXISTS keeps the cell re-runnable: columns already removed by an earlier
-- run are skipped instead of failing the statement.
ALTER TABLE workspace.sc_silver.deals
DROP COLUMNS IF EXISTS (
  row_names,
  tag,
  amount,
  expected_revenue,
  pioneiros_myhyundai,
  lead_source,
  login_myhyundai,
  aprovacao_do_pedido_de_apoio,
  `type`        -- quote reserved word
);


In [0]:
%sql
-- Remove a further set of columns from propostas_realizadas.
-- IF EXISTS keeps the cell re-runnable: columns already removed by an earlier
-- run are skipped instead of failing the statement.
ALTER TABLE workspace.sc_silver.propostas_realizadas
DROP COLUMNS IF EXISTS (
  sales_status,
  row_names,
  tax,
  discount,
  adjustments,
  estado_do_contrato
);



In [0]:
from pyspark.sql import functions as F

# ==== DEALS ====
# Copy deals into deals_renamed with "_deals" appended to every column name.
df_deals = spark.table("workspace.sc_silver.deals")
df_deals_renamed = df_deals.select(
    *[F.col(c).alias(f"{c}_deals") for c in df_deals.columns]
)
# overwriteSchema: a plain overwrite keeps the target table's existing schema,
# so a change in the source's columns (e.g. after DROP COLUMNS) would not be
# carried over to this copy on a re-run.
df_deals_renamed.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.sc_silver.deals_renamed")
display(spark.table("workspace.sc_silver.deals_renamed"))

# ==== CAMPAIGNS ====
# Same treatment for campaigns, with the "_campaigns" suffix.
df_campaigns = spark.table("workspace.sc_silver.campaigns")
df_campaigns_renamed = df_campaigns.select(
    *[F.col(c).alias(f"{c}_campaigns") for c in df_campaigns.columns]
)
df_campaigns_renamed.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.sc_silver.campaigns_renamed")
display(spark.table("workspace.sc_silver.campaigns_renamed"))


In [0]:
from pyspark.sql import functions as F

# Load renamed tables
df_deals_renamed = spark.table("workspace.sc_silver.deals_renamed")
df_campaigns_renamed = spark.table("workspace.sc_silver.campaigns_renamed")

# Left join keeps every deal; campaign columns are NULL when
# campanha_deals has no matching id_campaigns.
df_deals_with_campaigns = df_deals_renamed.join(
    df_campaigns_renamed,
    df_deals_renamed.campanha_deals == df_campaigns_renamed.id_campaigns,
    "left"
)

# Save as permanent table.
# overwriteSchema: a plain overwrite keeps the target table's existing schema,
# so column changes in the inputs would not reach this table on a re-run.
df_deals_with_campaigns.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.sc_silver.deals_with_campaigns")

# Preview
display(spark.table("workspace.sc_silver.deals_with_campaigns"))


In [0]:
from pyspark.sql import functions as F

# Load original table
df = spark.table("workspace.sc_silver.propostas_realizadas")

# Build SELECT list with aliases (append _propostas_realizadas to each column)
select_expr = [F.col(c).alias(f"{c}_propostas_realizadas") for c in df.columns]
df_renamed = df.select(*select_expr)

# Write to a NEW table so the original is left untouched.
# overwriteSchema: a plain overwrite keeps the target table's existing schema,
# so a change in the source's columns (e.g. after DROP COLUMNS) would not be
# carried over to this copy on a re-run.
df_renamed.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.sc_silver.propostas_realizadas_renamed")

# Display the new table
display(spark.table("workspace.sc_silver.propostas_realizadas_renamed"))

In [0]:
#  JOIN propostas_realizadas_renamed with deals_with_campaigns ====

# Load both tables
df_propostas_renamed = spark.table("workspace.sc_silver.propostas_realizadas_renamed")
df_deals_with_campaigns = spark.table("workspace.sc_silver.deals_with_campaigns")

# LEFT join from propostas to deals on the contact id.
# NOTE(review): if a contact has several deals, each of its proposals is
# repeated once per deal — confirm that this row multiplication is intended.
df_joined = df_propostas_renamed.join(
    df_deals_with_campaigns,
    df_propostas_renamed.id_contacto_propostas_realizadas == df_deals_with_campaigns.id_contacto_deals,
    "left"
)

# Save as propostas_with_deals_with_campaigns.
# overwriteSchema: a plain overwrite keeps the target table's existing schema,
# so column changes in the inputs would not reach this table on a re-run.
df_joined.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.sc_silver.propostas_with_deals_with_campaigns")

# Display the result
display(spark.table("workspace.sc_silver.propostas_with_deals_with_campaigns"))

In [0]:
%sql
-- Sample of up to 100 rows from leads_pbs.
SELECT leads.*
FROM workspace.sc_silver.leads_pbs AS leads
LIMIT 100;


In [0]:
%sql
-- Sample of up to 100 rows from contactos_pbs.
SELECT contacts.*
FROM workspace.sc_silver.contactos_pbs AS contacts
LIMIT 100;


In [0]:
%sql
-- Distinct contact ids that are referenced by at least one lead through
-- leads_pbs.converted_contact.
SELECT COUNT(DISTINCT contacts.id) AS common_unique_ids
FROM workspace.sc_silver.contactos_pbs AS contacts
INNER JOIN workspace.sc_silver.leads_pbs AS leads
  ON contacts.id = leads.converted_contact;


In [0]:
%sql
-- Contacts referenced by a lead, and the subset of those contacts for which
-- some matching lead also has created_time equal to data_criacao_da_lead.
SELECT
  COUNT(DISTINCT contacts.id) AS common_unique_ids,
  COUNT(DISTINCT
    CASE
      WHEN contacts.data_criacao_da_lead = leads.created_time THEN contacts.id
    END
  ) AS common_unique_ids_with_date
FROM workspace.sc_silver.contactos_pbs AS contacts
INNER JOIN workspace.sc_silver.leads_pbs AS leads
  ON contacts.id = leads.converted_contact;


In [0]:
from pyspark.sql import functions as F

# ==== LEADS ====
# Copy leads_pbs into leads_renamed with "_leads" appended to every column name.
df_leads = spark.table("workspace.sc_silver.leads_pbs")
df_leads_renamed = df_leads.select(
    *[F.col(c).alias(f"{c}_leads") for c in df_leads.columns]
)
# overwriteSchema: a plain overwrite keeps the target table's existing schema,
# so a change in the source's columns (e.g. after DROP COLUMNS) would not be
# carried over to this copy on a re-run.
df_leads_renamed.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.sc_silver.leads_renamed")
display(spark.table("workspace.sc_silver.leads_renamed"))

# ==== CONTACTOS ====
# Same treatment for contactos_pbs, with the "_contactos" suffix.
df_contactos = spark.table("workspace.sc_silver.contactos_pbs")
df_contactos_renamed = df_contactos.select(
    *[F.col(c).alias(f"{c}_contactos") for c in df_contactos.columns]
)
df_contactos_renamed.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.sc_silver.contactos_renamed")
display(spark.table("workspace.sc_silver.contactos_renamed"))


In [0]:
from pyspark.sql import functions as F

# Load both renamed tables
df_leads = spark.table("workspace.sc_silver.leads_renamed")
df_contactos = spark.table("workspace.sc_silver.contactos_renamed")

# LEFT join on both ID and creation timestamp: every lead is kept, and contact
# columns are filled only when the contact id AND the parsed timestamps agree.
# NOTE(review): assumes both date columns are strings shaped like
# "dd-MM-yyyy HH:mm" — TODO confirm against the table schemas.
df_joined = df_leads.join(
    df_contactos,
    (df_leads.converted_contact_leads == df_contactos.id_contactos) &
    (F.to_timestamp(df_leads.created_time_leads, "dd-MM-yyyy HH:mm") ==
     F.to_timestamp(df_contactos.data_criacao_da_lead_contactos, "dd-MM-yyyy HH:mm")),
    "left"
)

# Save as a new permanent table.
# overwriteSchema: a plain overwrite keeps the target table's existing schema,
# so column changes in the inputs would not reach this table on a re-run.
df_joined.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.sc_silver.leads_with_contactos")

# Display preview
display(spark.table("workspace.sc_silver.leads_with_contactos"))


In [0]:
from pyspark.sql import functions as F

# Read back the joined leads/contacts table
df_new = spark.table("workspace.sc_silver.leads_with_contactos")

# Both timestamps are parsed with the same pattern before being compared
ts_pattern = "dd-MM-yyyy HH:mm"
contact_created_at = F.to_timestamp(F.col("data_criacao_da_lead_contactos"), ts_pattern)
lead_created_at = F.to_timestamp(F.col("created_time_leads"), ts_pattern)

# Distinct contacts present in the join, and the subset whose two parsed
# creation timestamps are equal (rows failing the test contribute NULL,
# which countDistinct ignores)
contact_ids_with_same_date = F.when(contact_created_at == lead_created_at, F.col("id_contactos"))
df_counts = df_new.agg(
    F.countDistinct("id_contactos").alias("common_unique_ids"),
    F.countDistinct(contact_ids_with_same_date).alias("common_unique_ids_with_date"),
)

df_counts.show()


In [0]:
%sql

-- Sample of up to 100 rows from the proposals + deals + campaigns table.
SELECT joined.*
FROM workspace.sc_silver.propostas_with_deals_with_campaigns AS joined
LIMIT 100;


In [0]:
%sql
-- Sample of up to 100 rows from the leads + contacts table.
SELECT joined.*
FROM workspace.sc_silver.leads_with_contactos AS joined
LIMIT 100;


In [0]:
from pyspark.sql import functions as F

# Load both tables
df_leads_contactos = spark.table("workspace.sc_silver.leads_with_contactos")
df_propostas = spark.table("workspace.sc_silver.propostas_with_deals_with_campaigns")

# LEFT join on id_contactos vs id_contacto_propostas_realizadas: every
# lead/contact row is kept, with proposal columns NULL when nothing matches.
df_funil = df_leads_contactos.join(
    df_propostas,
    df_leads_contactos.id_contactos == df_propostas.id_contacto_propostas_realizadas,
    "left"
)

# Save as permanent table.
# overwriteSchema: a plain overwrite keeps the target table's existing schema,
# so column changes in the inputs would not reach this table on a re-run.
df_funil.write.format("delta").mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("workspace.sc_silver.Funil")

# Preview
display(spark.table("workspace.sc_silver.Funil"))


In [0]:
%sql
-- List the tables registered in the sc_silver schema.
SHOW TABLES FROM workspace.sc_silver;

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import (
    StringType, DoubleType, FloatType
)

# Load table (fully qualified, matching the catalog used when Funil was saved)
df = spark.table("workspace.sc_silver.funil")

# Compute, per column, the percentage of "blank" rows and expose the result as
# `tidy` with one (column, blank_pct) row per source column.
# `tidy` is assigned on both branches so that code reusing it afterwards also
# works when the table is empty.
total_rows = df.count()
if total_rows == 0:
    tidy = spark.createDataFrame([], "column string, blank_pct double")
    tidy.show()
else:
    # Build one aggregation per column
    agg_exprs = []
    col_names = []
    for field in df.schema.fields:
        c = field.name
        dt = field.dataType

        if isinstance(dt, StringType):
            # NULL or empty/whitespace
            cond = F.col(c).isNull() | (F.length(F.trim(F.col(c))) == 0)
        elif isinstance(dt, (DoubleType, FloatType)):
            # NULL or NaN
            cond = F.col(c).isNull() | F.isnan(F.col(c))
        else:
            # Other types: only NULL counts as blank
            cond = F.col(c).isNull()

        # avg of a 0/1 indicator = fraction of blank rows; scaled to a
        # percentage and rounded to 4 decimals
        agg_exprs.append(F.round(F.avg(F.when(cond, 1).otherwise(0)) * 100, 4).alias(c))
        col_names.append(c)

    # One row with % per column
    pct_row = df.agg(*agg_exprs)

    # Reshape to (column, blank_pct)
    stack_expr = ", ".join([f"'{c}', `{c}`" for c in col_names])
    tidy = pct_row.selectExpr(f"stack({len(col_names)}, {stack_expr}) as (column, blank_pct)")

    tidy.orderBy(F.desc("blank_pct")).show(truncate=False)


In [0]:
# Print every row of the blank-percentage report (show() is otherwise capped
# at its default row limit), highest percentage first.
tidy_sorted = tidy.orderBy(F.desc("blank_pct"))
tidy_sorted.show(n=tidy.count(), truncate=False)


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType, FloatType

# Names of the sc_silver tables to profile; each one is passed to
# blank_percentage_for_table by the loop at the end of this cell.
tables = [
    "bd_rede_hyundai",
    "campanha_vouchers",
    "campanhas_tecnicas",
    "contactos_importador_pbs",
    "contratos_financiamento",
    "historico_de_servicos",
    "viaturas",
    "viaturas_demo"
]

def blank_percentage_for_table(table_name):
    """Return the percentage of blank values for each column of a table.

    Reads ``sc_silver.<table_name>`` and treats a value as blank when it is
    NULL, when it is an empty or whitespace-only string (string columns), or
    when it is NaN (double/float columns).

    Args:
        table_name: Table name inside the ``sc_silver`` schema.

    Returns:
        A DataFrame with columns ``column`` and ``blank_pct`` (percentage
        rounded to 4 decimals), sorted by ``blank_pct`` descending. An empty
        DataFrame with the same two columns is returned for an empty table.
    """

    def _blank_condition(field):
        # Boolean column expression marking the blank rows of one field.
        value = F.col(field.name)
        if isinstance(field.dataType, StringType):
            return value.isNull() | (F.length(F.trim(value)) == 0)
        if isinstance(field.dataType, (DoubleType, FloatType)):
            return value.isNull() | F.isnan(value)
        return value.isNull()

    print(f"=== Processing table: {table_name} ===")
    source_df = spark.table(f"sc_silver.{table_name}")

    if source_df.count() == 0:
        print(f"Table {table_name} is empty.")
        return spark.createDataFrame([], "column string, blank_pct double")

    fields = source_df.schema.fields

    # Single-row frame: mean of a 0/1 blank indicator per column, as a percentage.
    percentages = source_df.agg(*[
        F.round(F.avg(F.when(_blank_condition(f), 1).otherwise(0)) * 100, 4).alias(f.name)
        for f in fields
    ])

    # Unpivot that row into one (column, blank_pct) row per source column.
    name_value_pairs = ", ".join(f"'{f.name}', `{f.name}`" for f in fields)
    report = percentages.selectExpr(
        f"stack({len(fields)}, {name_value_pairs}) as (column, blank_pct)"
    )

    return report.orderBy(F.desc("blank_pct"))

# Profile each listed table and print its complete report (one row per column)
for tbl in tables:
    result_df = blank_percentage_for_table(tbl)
    report_rows = result_df.count()
    result_df.show(n=report_rows, truncate=False)


In [0]:
%sql
-- bd_rede_hyundai
-- IF EXISTS keeps the statement re-runnable: columns already removed by an
-- earlier run are skipped instead of failing.
ALTER TABLE sc_silver.bd_rede_hyundai
DROP COLUMNS IF EXISTS (
  changed_by,
  created_by,
  tag,
  dt_created,
  dt_changed
);

-- campanha_vouchers
-- IF EXISTS keeps the statement re-runnable: columns already removed by an
-- earlier run are skipped instead of failing.
ALTER TABLE sc_silver.campanha_vouchers
DROP COLUMNS IF EXISTS (
  email,
  unsubscribed_time,
  tag,
  secondary_email,
  unsubscribed_mode
);

-- campanhas_tecnicas
-- IF EXISTS keeps the statement re-runnable: columns already removed by an
-- earlier run are skipped instead of failing.
ALTER TABLE sc_silver.campanhas_tecnicas
DROP COLUMNS IF EXISTS (
  email,
  unsubscribed_mode,
  tag,
  unsubscribed_time,
  secondary_email
);

-- contactos_importador_pbs
-- IF EXISTS keeps the statement re-runnable: columns already removed by an
-- earlier run are skipped instead of failing.
ALTER TABLE sc_silver.contactos_importador_pbs
DROP COLUMNS IF EXISTS (
  city,
  chave_instalacao_bd_rede,
  instalacao,
  ticket_suporte,
  tag,
  region,
  country,
  pioneiros_myhyundai,
  chave_concessao_bd_rede,
  equipa,
  formulario
);

-- contratos_financiamento
-- IF EXISTS keeps the statement re-runnable: columns already removed by an
-- earlier run are skipped instead of failing.
ALTER TABLE sc_silver.contratos_financiamento
DROP COLUMNS IF EXISTS (
  secondary_email,
  unsubscribed_mode,
  unsubscribed_time,
  valor_residual_da_viatura,
  tag
);

-- viaturas
-- IF EXISTS keeps the statement re-runnable: columns already removed by an
-- earlier run are skipped instead of failing.
ALTER TABLE sc_silver.viaturas
DROP COLUMNS IF EXISTS (
  description_dtc_5,
  secondary_email,
  description_dtc_1,
  description_dtc_10,
  description_dtc_4,
  n__da_fatura,
  description_dtc_7,
  email,
  preco_unit_venda,
  description_dtc_6,
  n__dav,
  canal_de_venda,
  financeira_da_viatura,
  description_dtc_8,
  description_dtc_3,
  unsubscribed_time,
  dealer_name,
  description_dtc_9,
  dealer_code,
  origem_viatura,
  cod_postal_concessao,
  description_dtc_2,
  endereco_concessao,
  unsubscribed_mode,
  tem_contrato_de_manutencao,
  tag,
  extensao_de_garantia
);

-- viaturas_demo
-- IF EXISTS keeps the statement re-runnable: columns already removed by an
-- earlier run are skipped instead of failing.
ALTER TABLE sc_silver.viaturas_demo
DROP COLUMNS IF EXISTS (
  preco_base,
  email,
  pintura,
  secondary_email,
  unsubscribed_mode,
  iva,
  isv,
  unsubscribed_time,
  motivo,
  sgpu,
  tag
);
