# Importações

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Carregar os dados da camada Silver

In [0]:
# Silver-layer table that every gold dimension and fact below is derived from.
SILVER_TABLE = "silver_ooni"
df_silver = spark.table(SILVER_TABLE)

# Criar Dimensões (Esquema Estrela)

In [0]:
# Country dimension: one row per distinct probe_cc value, exposed as country_code.
dim_country = (
    df_silver
    .select(F.col("probe_cc").alias("country_code"))
    .dropDuplicates()
)

# Overwrite mode: each run replaces the previous contents of the gold table.
dim_country.write.saveAsTable("gold_dim_country", mode="overwrite")

# Test dimension: one row per distinct test_name value, exposed as test.
dim_test = (
    df_silver
    .select(F.col("test_name").alias("test"))
    .dropDuplicates()
)

# Overwrite mode: each run replaces the previous contents of the gold table.
dim_test.write.saveAsTable("gold_dim_test", mode="overwrite")

# Time dimension: every distinct measurement_ts together with its calendar
# date, year, month and day-of-month.
# NOTE(review): because measurement_ts is kept, the grain is one row per distinct
# timestamp, so the same `date` can appear on many rows. The measurement fact
# table in this notebook carries only `date`; joining it to this dimension on
# `date` would duplicate fact rows. Confirm the intended grain.
dim_time = (
    df_silver
    .select(
        "measurement_ts",
        F.to_date("measurement_ts").alias("date"),
        F.year("measurement_ts").alias("year"),
        F.month("measurement_ts").alias("month"),
        F.dayofmonth("measurement_ts").alias("day"),
    )
    .dropDuplicates()
)

# Overwrite mode: each run replaces the previous contents of the gold table.
dim_time.write.saveAsTable("gold_dim_time", mode="overwrite")

# Provider dimension: one row per distinct probe_asn value, exposed as asn.
dim_asn = (
    df_silver
    .select(F.col("probe_asn").alias("asn"))
    .dropDuplicates()
)

# Overwrite mode: each run replaces the previous contents of the gold table.
dim_asn.write.saveAsTable("gold_dim_asn", mode="overwrite")

# Criar Tabela Fato

In [0]:
# Measurement fact table: one output row per silver row, carrying the
# measurement id, the renamed dimension keys (country_code, test, asn), the
# calendar date derived from measurement_ts, and the three outcome flags.
measurement_columns = [
    F.col("measurement_uid"),
    F.col("probe_cc").alias("country_code"),
    F.col("test_name").alias("test"),
    F.col("probe_asn").alias("asn"),
    F.to_date("measurement_ts").alias("date"),
    F.col("is_anomalous"),
    F.col("is_confirmed_blocking"),
    F.col("has_failure"),
]

fact_measurement = df_silver.select(*measurement_columns)

# Overwrite mode: each run replaces the previous contents of the gold table.
fact_measurement.write.saveAsTable("gold_fact_measurement", mode="overwrite")


# Métricas Analíticas

In [0]:
%sql
-- Regional comparison over the last 12 months: measurement volume, anomaly
-- count and anomaly rate for every (country, test) pair.
--
-- The country column is read as country_code: gold_fact_measurement stores
-- probe_cc under that name, so referencing probe_cc here cannot be resolved.
-- NOTE(review): SUM needs a numeric input; this assumes is_anomalous is stored
-- as 0/1 rather than BOOLEAN — TODO confirm against the silver schema.

CREATE OR REPLACE TABLE gold_censorship_by_country_test AS
SELECT
  country_code,
  test,
  COUNT(*)                               AS total_measurements,
  SUM(is_anomalous)                      AS anomaly_count,
  ROUND(SUM(is_anomalous) / COUNT(*), 4) AS anomaly_rate
FROM gold_fact_measurement
-- Rolling window: keep rows dated within the 12 months before today.
WHERE date >= add_months(current_date(), -12)
GROUP BY country_code, test;


In [0]:
%sql
-- Preview the per-country, per-test anomaly table.
SELECT *
FROM gold_censorship_by_country_test

In [0]:
%sql
-- Brazil versus the regional average, per test.
-- avg_region_rate is the plain (unweighted) mean of the per-country anomaly
-- rates for the test; no country is filtered out, so Brazil's own rate is part
-- of that mean. brazil_rate is NULL when the test has no 'BR' row.
-- Reads the country_code column of gold_censorship_by_country_test.

CREATE OR REPLACE TABLE gold_brazil_vs_region AS
SELECT
  r.test,
  AVG(r.anomaly_rate) AS avg_region_rate,
  MAX(
    CASE
      WHEN r.country_code = 'BR' THEN r.anomaly_rate
      ELSE NULL
    END
  ) AS brazil_rate
FROM gold_censorship_by_country_test AS r
GROUP BY r.test;


In [0]:
%sql
-- Preview the Brazil-versus-region comparison table.
SELECT *
FROM gold_brazil_vs_region

In [0]:
%sql
-- Blocking volume over time: per date and country, the number of measurements
-- and the sum of the confirmed-blocking flag.
-- NOTE(review): SUM needs a numeric input; this assumes is_confirmed_blocking
-- is stored as 0/1 rather than BOOLEAN — TODO confirm.

CREATE OR REPLACE TABLE gold_blocking_over_time AS
SELECT
  f.date,
  f.country_code,
  COUNT(*)                     AS total_measurements,
  SUM(f.is_confirmed_blocking) AS confirmed_blocks
FROM gold_fact_measurement AS f
GROUP BY
  f.date,
  f.country_code;


In [0]:
%sql
-- Provider analysis for Brazil: per ASN, the measurement volume, anomaly count
-- and anomaly rate, restricted to rows whose country_code is 'BR'.
-- NOTE(review): SUM needs a numeric input; this assumes is_anomalous is stored
-- as 0/1 rather than BOOLEAN — TODO confirm.
-- NOTE(review): the ORDER BY applies while the table is written; queries that
-- need ranked output should presumably sort again on read — confirm.

CREATE OR REPLACE TABLE gold_blocking_by_asn_br AS
SELECT
  f.asn,
  COUNT(*)                                 AS total_measurements,
  SUM(f.is_anomalous)                      AS anomaly_count,
  ROUND(SUM(f.is_anomalous) / COUNT(*), 4) AS anomaly_rate
FROM gold_fact_measurement AS f
WHERE f.country_code = 'BR'
GROUP BY f.asn
ORDER BY anomaly_rate DESC;


In [0]:
%sql
-- Data quality summary for the fact table: total row count and the fraction
-- (0 to 1, rounded to 4 decimals) of rows with a missing country_code or asn.
--
-- Nulls are counted with CASE ... THEN 1 ELSE 0 because `col IS NULL` is a
-- BOOLEAN and Spark SQL's SUM only accepts numeric or interval inputs, so
-- SUM(col IS NULL) is rejected at analysis time.

CREATE OR REPLACE TABLE gold_data_quality_summary AS
SELECT
  COUNT(*) AS total_records,
  ROUND(SUM(CASE WHEN country_code IS NULL THEN 1 ELSE 0 END) / COUNT(*), 4) AS pct_country_missing,
  ROUND(SUM(CASE WHEN asn IS NULL THEN 1 ELSE 0 END) / COUNT(*), 4)          AS pct_asn_missing
FROM gold_fact_measurement;
