# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Storage Solutions (PostgreSQL)** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Create SparkSession

In [1]:
import os

import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession for this notebook. The PostgreSQL JDBC
# driver jar is registered through spark.jars so the JDBC cells further down
# can use org.postgresql.Driver.
# NOTE(review): the master host "bb6818473482" looks like a Docker container
# ID -- confirm it still resolves in your environment.
spark = (
    SparkSession.builder
    .appName("Examples on storage solutions with PosgreSQL")
    .master("spark://bb6818473482:7077")
    .config("spark.jars", "/opt/spark/work-dir/jars/postgresql-42.7.8.jar")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the SparkContext and restrict logging to the ERROR level.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

25/10/01 01:13:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Create DataFrames

In [2]:
from pcamarillor.spark_utils import SparkUtils
from pyspark.sql.functions import when, concat, col, lit

# (column name, Spark type name) pairs describing the airline CSV files.
airlines_schema_columns = [
    ("index", "int"),
    ("airline", "string"),
    ("flight", "string"),
    ("source_city", "string"),
    ("departure_time", "string"),
    ("stops", "string"),
    ("arrival_time", "string"),
    ("destination_city", "string"),
    ("class", "string"),
    ("duration", "float"),
    ("days_left", "int"),
    ("price", "int"),
]
# Project helper that turns the pairs above into the schema passed to the reader.
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)

base_path = "/opt/spark/work-dir/data/"
# NOTE(review): base_path already ends with "/", so this concatenation yields
# ".../data//airline/" -- presumably tolerated by the reader; confirm.
airlines_path = base_path + "/airline/"

# Read the CSV files with the explicit schema; the first line of each file is
# treated as a header.
df_airlines = (
    spark.read
    .option("header", "true")
    .schema(airlines_schema)
    .csv(airlines_path)
)


# Perform transformations

## Clean data

In [3]:
# Drop rows with missing values, using the default arguments of the
# null-handling API (na.drop() is the alias of dropna()).
df_airlines = df_airlines.na.drop()

## Normalize stops

In [4]:
# Translate the textual "stops" label into an integer column.
# NOTE(review): any label other than the four listed here (for example a
# combined label such as "two_or_more", if the data uses one) falls through
# to 4 -- confirm against the actual values in the dataset.
stops_label = col("stops")
stops_as_num = when(stops_label == "zero", 0)
for stop_word, stop_count in (("one", 1), ("two", 2), ("three", 3)):
    stops_as_num = stops_as_num.when(stops_label == stop_word, stop_count)

df_airlines = df_airlines.withColumn("stops_as_num", stops_as_num.otherwise(4))

## Create route column

In [5]:
# Build a "source → destination" label for each row from the two city columns.
route_label = concat(col("source_city"), lit(" → "), col("destination_city"))
df_airlines = df_airlines.withColumn("route", route_label)

# Write data to a PostgreSQL DB

## Install PostgreSQL with Docker


    docker run -d --name postgres-iteso --network spark_default -e POSTGRES_PASSWORD=Admin@1234 postgres

In [6]:
# Print the current working directory of the notebook (shell escape).
!pwd

/opt/spark/work-dir/examples


In [7]:
# Persist the transformed DataFrame into PostgreSQL over JDBC.
jdbc_url = "jdbc:postgresql://postgres-iteso:5432/postgres"
table_name = "airlines_transformed"

# Credentials are taken from the environment when available; the fallbacks
# are the values this example has always used, so existing setups keep working.
pg_user = os.environ.get("POSTGRES_USER", "postgres")
pg_password = os.environ.get("POSTGRES_PASSWORD", "Admin@1234")

# mode("overwrite") replaces the table on every run. Without an explicit mode
# the writer uses its default "error if exists" behaviour, so re-running the
# notebook would fail as soon as the table has been created once.
(
    df_airlines.write
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", table_name)
    .option("user", pg_user)
    .option("password", pg_password)
    .option("driver", "org.postgresql.Driver")
    .mode("overwrite")
    .save()
)

print("DataFrame successfully written into a PosgreSQL DB !")



DataFrame successfully written into a PosgreSQL DB !


                                                                                

# Read data from a PostgreSQL DB

In [8]:
# Read the table back from PostgreSQL over JDBC and inspect it.
jdbc_url = "jdbc:postgresql://postgres-iteso:5432/postgres"

# Credentials are taken from the environment when available; the fallbacks
# are the values this example has always used, so existing setups keep working.
db_properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD", "Admin@1234"),
    "driver": "org.postgresql.Driver",
}

# table_name is defined in the cell that writes the table.
df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=db_properties)

# Show the schema and the first five rows without truncating cell contents.
df.printSchema()
df.show(5, truncate=False)

root
 |-- index: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- class: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- days_left: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- stops_as_num: integer (nullable = true)
 |-- route: string (nullable = true)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+------------+--------------+
|index|airline |flight |source_city|departure_time|stops|arrival_time |destination_city|class  |duration|days_left|price|stops_as_num|route         |
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+--

In [9]:
# Shut down the SparkContext now that the example is finished.
sc.stop()