In [1]:
from datetime import datetime
from os import environ as env
from urllib.parse import quote_plus

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) a local Spark session for this ETL run.
session_builder = (
    SparkSession.builder
    .master("local[1]")
    .appName("My ETL")
    # Maven coordinates of the SQL Server JDBC driver.
    .config("spark.jars.packages", "com.microsoft.sqlserver:mssql-jdbc:12.4.2.jre11")
    # Extra jar path(s) supplied through the SPARK_JAR_PATH environment variable.
    .config("spark.jars", env["SPARK_JAR_PATH"])
)
spark = session_builder.getOrCreate()

# Report what we are actually running on.
print(f"Spark Version: {spark.version}")
print(f"Spark App Name: {spark.sparkContext.appName}")

Spark Version: 3.4.0
Spark App Name: My ETL


In [3]:
ingestion_date = datetime.now().strftime("%Y-%m-%d")
server_name = env["MSSQL_HOST"]
database_name = env["MSSQL_DB"]
username = env["MSSQL_SA_USER"]
password = env["MSSQL_SA_PASSWORD"]
port = env["MSSQL_PORT"]
src_jdbc_url = f"jdbc:sqlserver://{server_name}:{port};database={database_name};user={username};password={password};trustServerCertificate=true;"
src_schema="dbo"
src_table="DimCustomer"
# Target Postgres conexion
tgt_jdbc_url = f"jdbc:postgresql://{env['POSTGRES_HOST']}:{env['POSTGRES_PORT']}/{env['POSTGRES_DB']}?user={env['POSTGRES_USER']}&password={env['POSTGRES_PASSWORD']}"
tgt_table="dim_customer"

output_path="/tmp"
final_output_path= f"{output_path}/{src_table}_{ingestion_date}"

Extract

In [4]:
try:
    # Pull every column of the source table named in the configuration cell.
    sql_query = f"""SELECT * FROM {src_schema}.{src_table}"""

    # Read the SQL Server query result into a PySpark DataFrame over JDBC.
    df_src = spark.read.format("jdbc") \
        .option("url", src_jdbc_url) \
        .option("query", sql_query) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()

    # Quick sanity checks: row count, a small sample and the inferred schema.
    print(f"Total rows: {df_src.count()}")
    df_src.show(5, truncate=False)
    df_src.printSchema()
except Exception as e:
    print("Error al extraer datos: " + str(e))
    # Re-raise so "Run All" stops here; otherwise later cells would run with
    # `df_src` undefined or left over from a previous execution.
    raise

Total rows: 18484
+-----------+------------+--------------------+-----+---------+----------+--------+---------+----------+-------------+------+------+------------------------------+------------+-------------+--------------------+----------------+----------------+---------------+-----------------+-----------------+----------------+--------------+---------------+-------------------+------------+-------------------+-----------------+---------------+
|CustomerKey|GeographyKey|CustomerAlternateKey|Title|FirstName|MiddleName|LastName|NameStyle|BirthDate |MaritalStatus|Suffix|Gender|EmailAddress                  |YearlyIncome|TotalChildren|NumberChildrenAtHome|EnglishEducation|SpanishEducation|FrenchEducation|EnglishOccupation|SpanishOccupation|FrenchOccupation|HouseOwnerFlag|NumberCarsOwned|AddressLine1       |AddressLine2|Phone              |DateFirstPurchase|CommuteDistance|
+-----------+------------+--------------------+-----+---------+----------+--------+---------+----------+------------

Transform

In [None]:
try:
    # Keep only customers that have an e-mail address.
    transformed_df = df_src.filter(col("EmailAddress").isNotNull())

    # Derive the reporting columns:
    #   FullName      - first and last name joined with ", "
    #   Gender        - "M"/"F" expanded to "Male"/"Female"; anything else becomes "Unknown"
    #   DBSource      - constant tag naming the source database
    #   IngestionDate - run date computed in the configuration cell
    transformed_df = (
        transformed_df
        .withColumn("FullName", concat_ws(", ", transformed_df.FirstName, transformed_df.LastName))
        .withColumn(
            "Gender",
            when(col("Gender") == "M", "Male")
            .when(col("Gender") == "F", "Female")
            .otherwise("Unknown"),
        )
        .withColumn("DBSource", lit("AdventureWorks"))
        .withColumn("IngestionDate", lit(ingestion_date))
    )

    # Project the columns that make up the target layout, in order.
    final_df = transformed_df.select(
        "CustomerKey",
        "FirstName",
        "LastName",
        "FullName",
        "BirthDate",
        "Gender",
        "EmailAddress",
        "DBSource",
        "IngestionDate",
    )

    final_df.show(5, truncate=False)

    # Persist the result as ';'-separated CSV with a header row, replacing any
    # previous output written to the same folder.
    final_df.write.format("csv") \
        .option("header", "true") \
        .option("sep", ";") \
        .mode("overwrite") \
        .save(final_output_path)

    print(f"Datos cargados existosamente en {final_output_path}")

except Exception as e:
    print("Error al transformar los datos: " + str(e))
    # Re-raise so the Load cell never runs against a missing, partial or
    # stale CSV folder after a failed transform.
    raise

+-----------+---------+--------+------------------+----------+------+------------------------------+--------------+-------------+
|CustomerKey|FirstName|LastName|FullName          |BirthDate |Gender|EmailAddress                  |DBSource      |IngestionDate|
+-----------+---------+--------+------------------+----------+------+------------------------------+--------------+-------------+
|11000      |Jon      |Yang    |Jon, Yang         |1971-10-06|Male  |jon24@adventure-works.com     |AdventureWorks|2026-01-06   |
|11001      |Eugene   |Huang   |Eugene, Huang     |1976-05-10|Male  |eugene10@adventure-works.com  |AdventureWorks|2026-01-06   |
|11002      |Ruben    |Torres  |Ruben, Torres     |1971-02-09|Male  |ruben35@adventure-works.com   |AdventureWorks|2026-01-06   |
|11003      |Christy  |Zhu     |Christy, Zhu      |1973-08-14|Female|christy12@adventure-works.com |AdventureWorks|2026-01-06   |
|11004      |Elizabeth|Johnson |Elizabeth, Johnson|1979-08-05|Female|elizabeth5@adventure-

Load

In [8]:
try:
    # Read back the CSV written by the Transform step (same header and separator).
    # NOTE(review): no schema or inferSchema option is given, so the columns are
    # presumably all read as strings — confirm that is acceptable for staging.
    df = spark.read.format("csv") \
        .option("header", "true") \
        .option("sep", ";") \
        .load(final_output_path)

    df.show(5, truncate=False)  # Sanity-check the data before loading

    # Replace the contents of the Postgres staging table with this run's data.
    df.write.mode("overwrite") \
        .format("jdbc") \
        .option("url", tgt_jdbc_url) \
        .option("dbtable", "stg_"+tgt_table) \
        .option("driver", "org.postgresql.Driver") \
        .save()

    print(f"Datos cargados existosamente en la tabla stg_{tgt_table}")
except Exception as e:
    print("Error al cargar los datos: " + str(e))
    # Re-raise so a failed load is reported as a failed cell instead of
    # letting the notebook finish as if the staging table had been refreshed.
    raise

+-----------+---------+--------+------------------+----------+------+------------------------------+--------------+-------------+
|CustomerKey|FirstName|LastName|FullName          |BirthDate |Gender|EmailAddress                  |DBSource      |IngestionDate|
+-----------+---------+--------+------------------+----------+------+------------------------------+--------------+-------------+
|11000      |Jon      |Yang    |Jon, Yang         |1971-10-06|Male  |jon24@adventure-works.com     |AdventureWorks|2026-01-06   |
|11001      |Eugene   |Huang   |Eugene, Huang     |1976-05-10|Male  |eugene10@adventure-works.com  |AdventureWorks|2026-01-06   |
|11002      |Ruben    |Torres  |Ruben, Torres     |1971-02-09|Male  |ruben35@adventure-works.com   |AdventureWorks|2026-01-06   |
|11003      |Christy  |Zhu     |Christy, Zhu      |1973-08-14|Female|christy12@adventure-works.com |AdventureWorks|2026-01-06   |
|11004      |Elizabeth|Johnson |Elizabeth, Johnson|1979-08-05|Female|elizabeth5@adventure-

In [9]:
# Stop the Spark session. Calling stop() on the session (rather than only on
# its SparkContext) shuts down the underlying context as well and goes through
# the session's own shutdown path.
spark.stop()