# Importações

In [0]:
import requests
import json
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Definição de parâmetros

In [0]:
# Delete the previous download (recursively) before collecting again
dbutils.fs.rm("/mnt/datalake/bronze/ooni", recurse=True)


# Endpoint queried for measurement listings
BASE_URL = "https://api.ooni.io/api/v1/measurements"

# Country codes sent as the `probe_cc` query filter
COUNTRIES = [
    "AR", # Argentina
    "BO", # Bolivia
    "BR", # Brazil
    "CL", # Chile
    "CO", # Colombia
    "CR", # Costa Rica
    "CU", # Cuba
    "DO", # Dominican Republic
    "EC", # Ecuador
    "SV", # El Salvador
    "GT", # Guatemala
    "HN", # Honduras
    "MX", # Mexico
    "NI", # Nicaragua
    "PA", # Panama
    "PY", # Paraguay
    "PE", # Peru
    "UY", # Uruguay
    "VE"  # Venezuela
]

# Test names sent as the `test_name` query filter
TEST_NAMES = [
    "web_connectivity",
    "facebook_messenger",
    "telegram",
    "whatsapp"
]

# First and last year collected (both inclusive)
START_YEAR = 2018
END_YEAR = 2024

LIMIT = 1000  # page size sent as `limit`; maximum allowed by the API


# Coleta dos dados

In [0]:
def fetch_ooni_data(params, session=None, base_url=None, timeout=120):
    """Fetch every page of measurements matching *params*.

    Pages are requested until the response no longer carries a
    ``metadata.next_cursor`` value; the ``results`` list of each page is
    concatenated in the order received.

    Args:
        params: Query-string parameters for the first request. The mapping is
            copied, so the caller's object is never modified.
        session: Optional object exposing a ``get(url, params=..., timeout=...)``
            method (for example a ``requests.Session`` to reuse connections).
            Defaults to the ``requests`` module itself.
        base_url: Endpoint to query. Defaults to the module-level ``BASE_URL``.
        timeout: Seconds passed to ``get`` so a stalled connection raises
            instead of blocking forever.

    Returns:
        A list with the ``results`` entries of all pages (possibly empty).

    Raises:
        requests.HTTPError: If any page answers with an error status
            (via ``raise_for_status``).
    """
    http = requests if session is None else session
    url = BASE_URL if base_url is None else base_url

    # Work on a private copy: the cursor must not leak into the caller's dict.
    query = dict(params)
    all_results = []

    # NOTE(review): pagination relies on the response exposing
    # `metadata.next_cursor`. If the API advertises the next page through a
    # different field, this loop stops after the first page - confirm against
    # the API documentation.
    while True:
        response = http.get(url, params=query, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # `or` guards against an explicit JSON null for either key.
        all_results.extend(data.get("results") or [])

        next_cursor = (data.get("metadata") or {}).get("next_cursor")
        if not next_cursor:
            break
        query["cursor"] = next_cursor

    return all_results


bronze_path = "/mnt/datalake/bronze/ooni"

# Download every (country, year, test) slice and write each one to its own
# test=/country=/year= directory under the bronze path.
for country in COUNTRIES:
    for year in range(START_YEAR, END_YEAR + 1):
        for test in TEST_NAMES:

            # NOTE(review): `until` is sent as Dec 31 without a time component;
            # confirm with the API whether measurements taken during Dec 31
            # are included in the slice.
            params = {
                "probe_cc": country,
                "since": f"{year}-01-01",
                "until": f"{year}-12-31",
                "test_name": test,
                "limit": LIMIT
            }

            print(f"üì• {country} {year} {test}")
            results = fetch_ooni_data(params)

            if not results:
                print("‚ö†Ô∏è Nenhum dado")
                continue

            # One JSON document per measurement; Spark infers the schema.
            df = spark.read.json(
                spark.sparkContext.parallelize(
                    [json.dumps(r) for r in results]
                )
            )

            (
                df.write
                .mode("append")
                .json(
                    f"{bronze_path}/test={test}/country={country}/year={year}"
                )
            )

            # Report the size from the in-memory list: df.count() would start
            # a second Spark job that re-parses the data just written.
            print(f"‚úÖ {len(results)} registros")


# Validação da integridade dos dados

In [0]:
# Number of entries directly under the bronze directory
bronze_entries = dbutils.fs.ls("/mnt/datalake/bronze/ooni")
print(f'qtd diret√≥rios: {len(bronze_entries)}')

# Total number of records read back from the bronze directory
df = spark.read.json("/mnt/datalake/bronze/ooni")
record_total = df.count()
print(f'qtd registros: {record_total}')



### Carregar dados brutos

In [0]:
# Read everything stored under the bronze path into a single DataFrame
bronze_path = "/mnt/datalake/bronze/ooni"
df = spark.read.format("json").load(bronze_path)

### Conferir completude de campos

In [0]:
# Summary statistics for the fields being checked for completeness
key_columns = ["probe_cc", "probe_asn", "test_name", "measurement_start_time"]
df.select(*key_columns).describe().show()

### Contagem de nulos

In [0]:
# One aggregate per column: count the rows where that column is NULL
null_counters = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in ("probe_cc", "probe_asn", "test_name", "measurement_start_time")
]
df.select(null_counters).show()

### Criar tabela relacional

In [0]:
# Persist the DataFrame as the `bronze_ooni` table, replacing existing contents
table_writer = df.write.mode("overwrite").option("mergeSchema", "true")
table_writer.saveAsTable("bronze_ooni")

In [0]:
%sql
-- List the distinct test names present in the bronze table
SELECT DISTINCT test_name FROM bronze_ooni

In [0]:
%sql

-- Show the columns and data types of the bronze table
describe bronze_ooni

In [0]:
%sql
-- Expand every field of `scores` (presumably a struct column inferred from the JSON; verify with DESCRIBE)
SELECT scores.* FROM bronze_ooni