## Part 2. PySpark & HDFS Experiment

### PySpark Session and HDFS connection

In [2]:
# Import PySpark and initialize Spark session
import os

import pyspark
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this notebook. The MySQL Connector/J
# jar is registered through `spark.jars` so the JDBC reads further down can
# load the com.mysql.cj.jdbc.Driver class.
spark = (
    SparkSession.builder
    .appName("PySparkHDFS")
    .config("spark.jars", "../lib/mysql-connector-j-8.2.0.jar")
    .getOrCreate()
)

25/12/10 11:04:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Exercise 1. Load all tables in Sakila Database into DataFrames

In [3]:
# List the name of every object registered under the `sakila` schema by
# querying MySQL's information_schema over JDBC. The subquery is aliased
# ("AS t") because Spark uses the `dbtable` value as the FROM clause of the
# query it sends to the database.
# Credentials are read from SAKILA_DB_USER / SAKILA_DB_PASSWORD when those
# environment variables are set and fall back to the local development
# defaults otherwise, so they can be overridden without editing the notebook.
tables = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/sakila")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("dbtable", "(SELECT table_name FROM information_schema.tables WHERE table_schema='sakila') AS t")
    .option("user", os.environ.get("SAKILA_DB_USER", "root"))
    .option("password", os.environ.get("SAKILA_DB_PASSWORD", "rootpassword"))
    .load()
)

In [4]:
# Bring the table-name rows back to the driver to inspect them.
tables.collect()

                                                                                

[Row(table_name='payment'),
 Row(table_name='store'),
 Row(table_name='film'),
 Row(table_name='staff_list'),
 Row(table_name='customer'),
 Row(table_name='customer_list'),
 Row(table_name='film_actor'),
 Row(table_name='sales_by_store'),
 Row(table_name='film_text'),
 Row(table_name='inventory'),
 Row(table_name='actor'),
 Row(table_name='nicer_but_slower_film_list'),
 Row(table_name='sales_by_film_category'),
 Row(table_name='actor_info'),
 Row(table_name='language'),
 Row(table_name='staff'),
 Row(table_name='address'),
 Row(table_name='rental'),
 Row(table_name='country'),
 Row(table_name='city'),
 Row(table_name='category'),
 Row(table_name='film_list'),
 Row(table_name='film_category')]

In [5]:
# Flatten the single-column rows into a plain Python list of name strings.
table_names = [record["table_name"] for record in tables.collect()]
table_names


['payment',
 'store',
 'film',
 'staff_list',
 'customer',
 'customer_list',
 'film_actor',
 'sales_by_store',
 'film_text',
 'inventory',
 'actor',
 'nicer_but_slower_film_list',
 'sales_by_film_category',
 'actor_info',
 'language',
 'staff',
 'address',
 'rental',
 'country',
 'city',
 'category',
 'film_list',
 'film_category']

In [None]:
# Copy every Sakila table to Parquet under output/part2/exercise1/ and print
# a 5-row preview of each one.
#
# Each DataFrame is persisted for the duration of its iteration so that the
# Parquet write and the preview share a single JDBC read; without it Spark
# re-runs the MySQL query for every action (once for write, once for show).
# The cached data is released in `finally` so a failing table does not leave
# blocks behind.
#
# Credentials are read from SAKILA_DB_USER / SAKILA_DB_PASSWORD when those
# environment variables are set and fall back to the local development
# defaults otherwise.
for table_name in table_names:
    df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306/sakila")
        .option("driver", "com.mysql.cj.jdbc.Driver")
        .option("dbtable", table_name)
        .option("user", os.environ.get("SAKILA_DB_USER", "root"))
        .option("password", os.environ.get("SAKILA_DB_PASSWORD", "rootpassword"))
        .load()
        .persist()
    )
    try:
        df.write.mode("overwrite").parquet(f"output/part2/exercise1/{table_name}")
        print(table_name)
        df.show(5)
    finally:
        df.unpersist()

                                                                                

payment
+----------+-----------+--------+---------+------+-------------------+-------------------+
|payment_id|customer_id|staff_id|rental_id|amount|       payment_date|        last_update|
+----------+-----------+--------+---------+------+-------------------+-------------------+
|         1|          1|       1|       76|  2.99|2005-05-25 11:30:37|2006-02-15 22:12:30|
|         2|          1|       1|      573|  0.99|2005-05-28 10:35:23|2006-02-15 22:12:30|
|         3|          1|       1|     1185|  5.99|2005-06-15 00:54:12|2006-02-15 22:12:30|
|         4|          1|       2|     1422|  0.99|2005-06-15 18:02:53|2006-02-15 22:12:30|
|         5|          1|       2|     1476|  9.99|2005-06-15 21:08:46|2006-02-15 22:12:30|
+----------+-----------+--------+---------+------+-------------------+-------------------+
only showing top 5 rows
store
+--------+----------------+----------+-------------------+
|store_id|manager_staff_id|address_id|        last_update|
+--------+---------------

                                                                                

actor_info


                                                                                

+--------+----------+------------+--------------------+
|actor_id|first_name|   last_name|           film_info|
+--------+----------+------------+--------------------+
|       1|  PENELOPE|     GUINESS|Animation: ANACON...|
|       2|      NICK|    WAHLBERG|Action: BULL SHAW...|
|       3|        ED|       CHASE|Action: CADDYSHAC...|
|       4|  JENNIFER|       DAVIS|Action: BAREFOOT ...|
|       5|    JOHNNY|LOLLOBRIGIDA|Action: AMADEUS H...|
+--------+----------+------------+--------------------+
only showing top 5 rows
language
+-----------+--------------------+-------------------+
|language_id|                name|        last_update|
+-----------+--------------------+-------------------+
|          1|English             |2006-02-15 05:02:19|
|          2|Italian             |2006-02-15 05:02:19|
|          3|Japanese            |2006-02-15 05:02:19|
|          4|Mandarin            |2006-02-15 05:02:19|
|          5|French              |2006-02-15 05:02:19|
+-----------+----------

                                                                                

rental
+---------+-------------------+------------+-----------+-------------------+--------+-------------------+
|rental_id|        rental_date|inventory_id|customer_id|        return_date|staff_id|        last_update|
+---------+-------------------+------------+-----------+-------------------+--------+-------------------+
|        1|2005-05-24 22:53:30|         367|        130|2005-05-26 22:04:30|       1|2006-02-15 21:30:53|
|        2|2005-05-24 22:54:33|        1525|        459|2005-05-28 19:40:33|       1|2006-02-15 21:30:53|
|        3|2005-05-24 23:03:39|        1711|        408|2005-06-01 22:12:39|       1|2006-02-15 21:30:53|
|        4|2005-05-24 23:04:41|        2452|        333|2005-06-03 01:43:41|       2|2006-02-15 21:30:53|
|        5|2005-05-24 23:05:21|        2079|        222|2005-06-02 04:33:21|       1|2006-02-15 21:30:53|
+---------+-------------------+------------+-----------+-------------------+--------+-------------------+
only showing top 5 rows
country
+------

In [7]:
# # Load all tables into DataFrames and write to HDFS
# for table_name in table_names:
#     df = spark.read.format("jdbc") \
#         .option("url", "jdbc:mysql://localhost:3306/sakila") \
#         .option("driver", "com.mysql.cj.jdbc.Driver") \
#         .option("dbtable", table_name) \
#         .option("user", "root") \
#         .option("password", "rootpassword") \
#         .load()
#     df.write.mode("overwrite").parquet(f"hdfs://localhost:9000/user/hadoop/sakila/{table_name}")

# Note: running this cell requires Spark on the host to connect to HDFS running in Docker.

### Exercise 2. Filter films released in 2008 or later.

I load the `film` table from MySQL into Spark in two ways:

#### 1. Filter at the source (Push-down to MySQL)

In [8]:
# Load only the films released in 2008 or later, filtering at the source:
# the aliased subquery is what Spark selects from, so the WHERE clause is
# evaluated by MySQL and only matching rows are transferred.
# Credentials are read from SAKILA_DB_USER / SAKILA_DB_PASSWORD when those
# environment variables are set and fall back to the local development
# defaults otherwise.
df_film_2008 = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/sakila")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("dbtable", "(SELECT * FROM film WHERE release_year >= 2008) as t")
    .option("user", os.environ.get("SAKILA_DB_USER", "root"))
    .option("password", os.environ.get("SAKILA_DB_PASSWORD", "rootpassword"))
    .load()
)

In [9]:
# Preview the films returned by the push-down query.
df_film_2008.show()

+-------+---------------+--------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+--------------------+-------------------+
|film_id|          title|         description|release_year|language_id|original_language_id|rental_duration|rental_rate|length|replacement_cost|rating|    special_features|        last_update|
+-------+---------------+--------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+--------------------+-------------------+
|   1001|  AI REVOLUTION|A future ruled by AI|  2024-01-01|          1|                NULL|              6|       2.99|   120|           19.99| PG   |Trailers         ...|2025-01-01 10:00:00|
|   1002|  DATA WARRIORS|Hackers fight for...|  2024-01-01|          1|                NULL|              5|       1.99|    95|           14.99| PG-13|Trailers         ...|2025-01-02 11:00:00|
|   1003|    CODE MATRIX|Programmer

In [10]:
# Number of films matched by the push-down query.
df_film_2008.count()

10

#### 2. Load full table and filter in Spark SQL

In [11]:
# Load the complete `film` table (no source-side filter) so the filtering can
# be done in Spark SQL instead, then print the schema Spark derived from the
# JDBC metadata.
# Credentials are read from SAKILA_DB_USER / SAKILA_DB_PASSWORD when those
# environment variables are set and fall back to the local development
# defaults otherwise.
df_film = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/sakila")
    .option("driver", "com.mysql.cj.jdbc.Driver")
    .option("dbtable", "film")
    .option("user", os.environ.get("SAKILA_DB_USER", "root"))
    .option("password", os.environ.get("SAKILA_DB_PASSWORD", "rootpassword"))
    .load()
)
df_film.printSchema()

root
 |-- film_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- release_year: date (nullable = true)
 |-- language_id: short (nullable = true)
 |-- original_language_id: short (nullable = true)
 |-- rental_duration: short (nullable = true)
 |-- rental_rate: decimal(4,2) (nullable = true)
 |-- length: integer (nullable = true)
 |-- replacement_cost: decimal(5,2) (nullable = true)
 |-- rating: string (nullable = true)
 |-- special_features: string (nullable = true)
 |-- last_update: timestamp (nullable = true)



In [12]:
# Expose the full film DataFrame to Spark SQL as the view `film`, then keep
# the films released in 2008 or later. `release_year` arrives as a DATE
# column, so the year is extracted with YEAR() before comparing.
df_film.createOrReplaceTempView("film")

films_since_2008_sql = "SELECT * FROM film WHERE YEAR(release_year) >= 2008"
film_2008 = spark.sql(films_since_2008_sql)
film_2008.show()

+-------+---------------+--------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+--------------------+-------------------+
|film_id|          title|         description|release_year|language_id|original_language_id|rental_duration|rental_rate|length|replacement_cost|rating|    special_features|        last_update|
+-------+---------------+--------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+--------------------+-------------------+
|   1001|  AI REVOLUTION|A future ruled by AI|  2024-01-01|          1|                NULL|              6|       2.99|   120|           19.99| PG   |Trailers         ...|2025-01-01 10:00:00|
|   1002|  DATA WARRIORS|Hackers fight for...|  2024-01-01|          1|                NULL|              5|       1.99|    95|           14.99| PG-13|Trailers         ...|2025-01-02 11:00:00|
|   1003|    CODE MATRIX|Programmer

In [None]:
# Save the push-down result as Parquet, replacing output from earlier runs.
df_film_2008.write.parquet("output/part2/exercise2", mode="overwrite")

### Exercise 3. Add new film records (in memory)

In [20]:
from decimal import Decimal
from datetime import date, datetime

# Ten synthetic films (ids 1011-1020) to append to the film data.
# Every row shares language_id = 1 and original_language_id = None, so only
# the fields that vary are listed in the specs; the comprehension below
# expands each spec into a 13-field tuple in `film` column order.
#
# Spec layout: film_id, title, description, release date (y, m, d),
# rental_duration, rental_rate, length, replacement_cost, rating,
# special_features, last_update (y, m, d, H, M, S).
_film_specs = [
    (1011, 'NEURAL VISION', 'AI-powered future surveillance system',
     (2024, 3, 12), 5, "3.99", 140, "22.99", 'PG-13', 'Trailers',
     (2025, 2, 1, 9, 15, 23)),
    (1012, 'QUANTUM BREACH', 'Hackers entering unstable quantum realm',
     (2025, 1, 28), 4, "2.49", 100, "18.99", 'R', 'Commentaries',
     (2025, 2, 2, 11, 5, 44)),
    (1013, 'COSMIC INTERRUPT', 'Astrophysicists trigger cosmic anomaly',
     (2024, 11, 2), 7, "4.99", 150, "21.99", 'PG', 'Behind the Scenes',
     (2025, 2, 3, 14, 22, 10)),
    (1014, 'SYNTHETIC SOUL', 'Robots developing emotional intelligence',
     (2023, 6, 9), 6, "3.49", 132, "19.99", 'PG-13', 'Deleted Scenes',
     (2025, 2, 4, 16, 44, 9)),
    (1015, 'EDGE OF MATRIX', 'System architects trapped in virtual grid',
     (2024, 9, 17), 4, "2.99", 118, "17.99", 'R', 'Trailers',
     (2025, 2, 5, 10, 30, 50)),
    (1016, 'PHANTOM NETWORK', 'Invisible cyberwar unfolding globally',
     (2025, 2, 1), 6, "4.49", 128, "23.99", 'PG-13', 'Commentaries',
     (2025, 2, 6, 13, 12, 33)),
    (1017, 'TERMINAL SHIFT', 'Scientists transfer consciousness digitally',
     (2023, 12, 21), 3, "1.99", 97, "14.99", 'PG', 'Trailers',
     (2025, 2, 7, 8, 58, 5)),
    (1018, 'PARALLEL DRIVE', 'Explorers navigate infinite timelines',
     (2024, 8, 14), 7, "4.29", 142, "24.99", 'PG-13', 'Behind the Scenes',
     (2025, 2, 8, 17, 41, 27)),
    (1019, 'ALGORITHM WARS', 'Autonomous AI factions battle for control',
     (2025, 1, 4), 5, "3.79", 121, "20.99", 'R', 'Deleted Scenes',
     (2025, 2, 9, 12, 25, 14)),
    (1020, 'DATALINK UPRISING', 'Interconnected databases gaining sentience',
     (2025, 2, 10), 7, "2.99", 112, "20.99", 'R', 'Behind the Scenes',
     (2025, 2, 10, 9, 45, 18)),
]

new_records = [
    (film_id, title, description, date(*released), 1, None, rental_duration,
     Decimal(rental_rate), length, Decimal(replacement_cost), rating,
     special_features, datetime(*updated))
    for (film_id, title, description, released, rental_duration, rental_rate,
         length, replacement_cost, rating, special_features, updated)
    in _film_specs
]


In [21]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ShortType, DateType, DecimalType, TimestampType

# Define the schema explicitly so the in-memory rows get the same column
# names and types that printSchema() reported for the JDBC `film` DataFrame.
# Every column is nullable.
film_fields = [
    ("film_id", IntegerType()),
    ("title", StringType()),
    ("description", StringType()),
    ("release_year", DateType()),              # date
    ("language_id", ShortType()),              # short
    ("original_language_id", ShortType()),     # short
    ("rental_duration", ShortType()),          # short
    ("rental_rate", DecimalType(4, 2)),        # decimal(4,2)
    ("length", IntegerType()),
    ("replacement_cost", DecimalType(5, 2)),   # decimal(5,2)
    ("rating", StringType()),
    ("special_features", StringType()),
    ("last_update", TimestampType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in film_fields]
)

# Build the DataFrame from the in-memory records and display it untruncated.
df_new = spark.createDataFrame(new_records, schema=schema)
df_new.show(truncate=False)


                                                                                

+-------+-----------------+-------------------------------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+-----------------+-------------------+
|film_id|title            |description                                |release_year|language_id|original_language_id|rental_duration|rental_rate|length|replacement_cost|rating|special_features |last_update        |
+-------+-----------------+-------------------------------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+-----------------+-------------------+
|1011   |NEURAL VISION    |AI-powered future surveillance system      |2024-03-12  |1          |NULL                |5              |3.99       |140   |22.99           |PG-13 |Trailers         |2025-02-01 09:15:23|
|1012   |QUANTUM BREACH   |Hackers entering unstable quantum realm    |2025-01-28  |1          |NULL                |4              |2.49   

In [22]:
# Append the new records to the original film DataFrame. unionByName matches
# columns by name rather than by position.
df_film_updated = df_film.unionByName(df_new)
# Row counts after and before the append, shown side by side as a quick check.
df_film_updated.count(), df_film.count()

                                                                                

(1020, 1010)

In [23]:
# Query the combined DataFrame through Spark SQL to confirm that the appended
# rows (film_id above 1010) are present.
df_film_updated.createOrReplaceTempView("film_updated")

appended_films = spark.sql("SELECT * FROM film_updated WHERE film_id > 1010")
appended_films.show(truncate=False)

+-------+-----------------+-------------------------------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+-----------------+-------------------+
|film_id|title            |description                                |release_year|language_id|original_language_id|rental_duration|rental_rate|length|replacement_cost|rating|special_features |last_update        |
+-------+-----------------+-------------------------------------------+------------+-----------+--------------------+---------------+-----------+------+----------------+------+-----------------+-------------------+
|1011   |NEURAL VISION    |AI-powered future surveillance system      |2024-03-12  |1          |NULL                |5              |3.99       |140   |22.99           |PG-13 |Trailers         |2025-02-01 09:15:23|
|1012   |QUANTUM BREACH   |Hackers entering unstable quantum realm    |2025-01-28  |1          |NULL                |4              |2.49   

In [None]:
# Save the combined film data as Parquet, replacing output from earlier runs.
df_film_updated.write.parquet("output/part2/exercise3", mode="overwrite")

                                                                                

### Stop PySpark Session

In [26]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()