In [12]:
from datetime import datetime
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import current_timestamp, date_format, col, lit
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from typing import List
from functools import reduce
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset paths configured in this notebook live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Logger configuration (disabled: the two lines below are commented out)
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# logger = logging.getLogger(__name__)

In [14]:
# Settings
# Bronze folder names to read; each name is also written to the "source" column of its rows.
APP_NAMES = ["tb_service_area", "tb_servicearea"]
# Root dataset folder on the mounted Google Drive (bronze inputs are read from "<BASE_PATH>bronze/<name>/").
BASE_PATH = "/content/drive/My Drive/projetos/Projeto Integrador/dataset/"
# Destination folder of the unified silver Parquet output.
OUTPUT_PATH = f"{BASE_PATH}silver/tb_service_area/"
# Table label shown in the final record-count log message.
TABLE_NAME = "tb_silver_service_area"

# Columns selected from every bronze source and cast to string for the silver table.
COLUMNS: List[str] = ["BusinessYear", "IssuerId",  "StateCode", "ServiceAreaId", "ServiceAreaName", "MarketCoverage", "ZipCodes", "VersionNum", "County", "CoverEntireState"] # Change as needed

In [15]:
def log_info(message: str):
    """Print ``message`` to stdout as a timestamped INFO line.

    The line has the form ``YYYY-MM-DD HH:MM:SS - INFO - <message>`` and uses
    the local time returned by ``datetime.now()``.
    """
    stamp = format(datetime.now(), "%Y-%m-%d %H:%M:%S")
    print("{} - INFO - {}".format(stamp, message))

def log_error(message: str):
    """Print ``message`` to stdout as a timestamped ERROR line.

    The line has the form ``YYYY-MM-DD HH:MM:SS - ERROR - <message>`` and uses
    the local time returned by ``datetime.now()``.
    """
    timestamp_text = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    line = "{} - ERROR - {}".format(timestamp_text, message)
    print(line)

def create_spark_session():
    """Return the Spark session used by this notebook.

    Gets or creates a session with the application name "UnifyServiceArea"
    and then applies a fixed set of SQL settings to it through ``spark.conf.set``.

    Returns:
        The configured SparkSession.
    """
    log_info("Criando sessão Spark")
    session = SparkSession.builder.appName("UnifyServiceArea").getOrCreate()

    # Applied in this order, after the session exists (dicts keep insertion order).
    sql_settings = {
        "spark.sql.parquet.mergeSchema": "false",
        "spark.sql.files.maxPartitionBytes": "128m",
        "spark.sql.files.openCostInBytes": "134217728",
        "spark.sql.broadcastTimeout": "3600",
    }
    for setting_name, setting_value in sql_settings.items():
        session.conf.set(setting_name, setting_value)

    return session

def define_silver_schema() -> StructType:
    """Build the StructType that describes the silver table.

    Every name in ``COLUMNS`` becomes a nullable string field. They are
    followed by three non-nullable metadata fields: ``ingestDate`` (timestamp),
    ``partitionDate`` (string) and ``source`` (string).

    Returns:
        The silver table schema, fields in the order described above.
    """
    log_info("Definindo schema para a tabela silver")

    # The loop variable is not called `col`, so it cannot be confused with
    # pyspark.sql.functions.col imported at the top of the notebook.
    fields = [StructField(column_name, StringType(), True) for column_name in COLUMNS]
    fields.extend([
        StructField("ingestDate", TimestampType(), False),
        StructField("partitionDate", StringType(), False),
        StructField("source", StringType(), False),
    ])
    return StructType(fields)

def read_and_process_data(spark: SparkSession, app_names: List[str], base_path: str, columns: List[str]) -> DataFrame:
    """Read every bronze source, tag its rows with metadata and union the results.

    For each name in ``app_names`` the Parquet folder ``{base_path}bronze/{name}/``
    is read, reduced to the distinct rows of ``columns`` and extended with:

    * ``ingestDate``    - current timestamp,
    * ``partitionDate`` - current timestamp formatted as ``yyyyMMdd``,
    * ``source``        - the app name the rows came from.

    A source that raises while being read or processed is logged through
    ``log_error`` and skipped; the remaining sources are still processed.

    Args:
        spark: Session used to read the Parquet folders.
        app_names: Bronze folder names to read.
        base_path: Dataset root; must end with a path separator because it is
            concatenated directly with ``bronze/``.
        columns: Columns selected from every source.

    Returns:
        The union of all successfully processed sources.

    Raises:
        ValueError: If no source could be processed.
    """
    loaded_frames = []
    for source_name in app_names:
        source_path = f"{base_path}bronze/{source_name}/"
        log_info(f"Tentando ler dados de {source_path}")
        try:
            raw = spark.read.parquet(source_path)
            log_info(f"Dados lidos com sucesso de {source_path}")
            deduplicated = raw.select(*columns).distinct()
            with_ingest = deduplicated.withColumn("ingestDate", current_timestamp())
            with_partition = with_ingest.withColumn(
                "partitionDate", date_format(current_timestamp(), "yyyyMMdd")
            )
            loaded_frames.append(with_partition.withColumn("source", lit(source_name)))
        except Exception as exc:
            # Best-effort: a failing source is reported and the loop moves on.
            log_error(f"Erro ao ler dados de {source_path}: {exc!s}")

    if len(loaded_frames) == 0:
        raise ValueError("Nenhum dado foi lido com sucesso")

    # Positional union; every frame was built with the same column order above.
    combined, *remaining = loaded_frames
    for frame in remaining:
        combined = combined.unionAll(frame)
    return combined

def prepare_silver_data(df: DataFrame, schema: StructType) -> DataFrame:
    """Cast the business columns to string and order the columns like ``schema``.

    Args:
        df: Combined frame containing every name in ``COLUMNS`` plus
            ``ingestDate``, ``partitionDate`` and ``source``.
        schema: Silver schema; only its field names are used here, to fix the
            final column order.

    Returns:
        A frame whose columns are ``schema.fieldNames()``, with the ``COLUMNS``
        entries cast to string and the metadata columns passed through as-is.
    """
    log_info("Preparando dados para o formato silver")

    business_columns = [col(name).cast("string") for name in COLUMNS]
    metadata_columns = [col(name) for name in ("ingestDate", "partitionDate", "source")]

    typed = df.select(business_columns + metadata_columns)
    return typed.select(schema.fieldNames())

def save_as_parquet(df: DataFrame, output_path: str):
    """Write ``df`` to ``output_path`` as Parquet, partitioned by ``partitionDate``.

    Args:
        df: Frame to persist; must contain a ``partitionDate`` column.
        output_path: Destination folder.
    """
    log_info(f"Salvando dados como Parquet em {output_path}")

    # NOTE(review): with Spark's default (static) partition overwrite mode,
    # "overwrite" presumably replaces everything under output_path, not only
    # the partition being written - confirm that dropping older partitions
    # is intended.
    writer = df.write.mode("overwrite").partitionBy("partitionDate")
    writer.parquet(output_path)

In [16]:
def main():
    """Run the bronze-to-silver job end to end and print a verification report.

    Steps: create the Spark session, build the silver schema, read and union
    the bronze sources, shape them to the silver layout, write the result to
    ``OUTPUT_PATH``, then read it back to print its schema, its first five rows
    and its record count.

    Any exception raised along the way is reported through ``log_error`` and
    is not re-raised, so this function always returns normally.
    """
    try:
        session = create_spark_session()
        schema = define_silver_schema()

        combined = read_and_process_data(session, APP_NAMES, BASE_PATH, COLUMNS)
        silver = prepare_silver_data(combined, schema)
        save_as_parquet(silver, OUTPUT_PATH)
        log_info(f"Dados salvos como Parquet em: {OUTPUT_PATH}")

        # Read the output back to check that it was saved correctly.
        written = session.read.parquet(OUTPUT_PATH)

        log_info("Schema dos dados salvos:")
        written.printSchema()

        log_info("Primeiras 5 linhas dos dados salvos:")
        written.show(5)

        record_count = written.count()
        log_info(f"Contagem de registros salvos: {record_count}, na tabela: {TABLE_NAME}")
        log_info("Processo concluído com sucesso")
    except Exception as exc:
        log_error(f"Erro durante a execução: {exc!s}")

In [17]:
# Run the pipeline only when this code executes as the top-level module.
if __name__ == "__main__":
    main()

2025-03-11 18:24:48 - INFO - Criando sessão Spark
2025-03-11 18:24:48 - INFO - Definindo schema para a tabela silver
2025-03-11 18:24:48 - INFO - Tentando ler dados de /content/drive/My Drive/projetos/Projeto Integrador/dataset/bronze/tb_service_area/
2025-03-11 18:24:48 - INFO - Dados lidos com sucesso de /content/drive/My Drive/projetos/Projeto Integrador/dataset/bronze/tb_service_area/
2025-03-11 18:24:48 - INFO - Tentando ler dados de /content/drive/My Drive/projetos/Projeto Integrador/dataset/bronze/tb_servicearea/
2025-03-11 18:24:48 - INFO - Dados lidos com sucesso de /content/drive/My Drive/projetos/Projeto Integrador/dataset/bronze/tb_servicearea/
2025-03-11 18:24:49 - INFO - Preparando dados para o formato silver
2025-03-11 18:24:49 - INFO - Salvando dados como Parquet em /content/drive/My Drive/projetos/Projeto Integrador/dataset/silver/tb_service_area/
2025-03-11 18:25:14 - INFO - Dados salvos como Parquet em: /content/drive/My Drive/projetos/Projeto Integrador/dataset/silv