# ETL Pipeline with Spark

In [1]:
from pyspark.sql import SparkSession

# create Spark session with mysql connector jar
spark = SparkSession.builder \
    .master('local') \
    .config("spark.jars", "/home/phinguyen/lib/mysql-connector-j-8.0.33.jar") \
    .appName('ETL_Pipeline_Testing') \
    .getOrCreate()
spark

24/05/22 18:16:58 WARN Utils: Your hostname, desktop resolves to a loopback address: 127.0.1.1; using 172.18.52.176 instead (on interface eth0)
24/05/22 18:16:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/05/22 18:17:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# get table Customer from mysql
customer_df = spark.read.format("jdbc") \
    .option("url", "jdbc:mysql://localhost:3306/retail_db") \
    .option("driver", "com.mysql.cj.jdbc.Driver") \
    .option("dbtable", "Customer") \
    .option("user", "admin") \
    .option("password", "adminpassword") \
    .load()

In [3]:
customer_df

DataFrame[CustomerID: int, Name: string, Email: string, Age: int, ModifiedDate: timestamp]

In [4]:
# create SQL view
customer_df.createOrReplaceTempView("mySQL_customer_tbl")

In [5]:
# run SQL query on the view
spark.sql("""
    SELECT 
        *
    FROM mySQL_customer_tbl
    LIMIT 3
""").show()

                                                                                

+----------+----------------+--------------------+---+-------------------+
|CustomerID|            Name|               Email|Age|       ModifiedDate|
+----------+----------------+--------------------+---+-------------------+
|         1|Christina Savage| vgreene@example.com| 59|2022-10-20 00:00:00|
|         2|   Zachary Green|costajohn@example...| 85|2022-10-20 00:00:00|
|         3|   Andrea Wilson|  john53@example.org| 73|2022-10-20 00:00:00|
+----------+----------------+--------------------+---+-------------------+



In [12]:
max_cus = spark.sql("""
    SELECT
        MAX(CustomerID)
    FROM mySQL_customer_tbl
""")

In [15]:
max_cus.collect()[0][0]

103358

In [11]:
customer_df.selectExpr("MAX(CustomerID)").collect()[0][0]

103358

In [6]:
# transform the spark dataframe
customer_output_df = spark.sql("""
    SELECT
        *,
        YEAR(ModifiedDate) AS year,
        MONTH(ModifiedDate) AS month,
        DAY(ModifiedDate) AS day
    FROM mySQL_customer_tbl
""")
print(type(customer_output_df))
customer_output_df.show(3)

<class 'pyspark.sql.dataframe.DataFrame'>
+----------+----------------+--------------------+---+-------------------+----+-----+---+
|CustomerID|            Name|               Email|Age|       ModifiedDate|year|month|day|
+----------+----------------+--------------------+---+-------------------+----+-----+---+
|         1|Christina Savage| vgreene@example.com| 59|2022-10-20 00:00:00|2022|   10| 20|
|         2|   Zachary Green|costajohn@example...| 85|2022-10-20 00:00:00|2022|   10| 20|
|         3|   Andrea Wilson|  john53@example.org| 73|2022-10-20 00:00:00|2022|   10| 20|
+----------+----------------+--------------------+---+-------------------+----+-----+---+
only showing top 3 rows



In [7]:
from py4j.java_gateway import java_import

# Import necessary classes from Java
java_import(spark._jvm, 'org.apache.hadoop.fs.FileSystem')
java_import(spark._jvm, 'org.apache.hadoop.fs.Path')
java_import(spark._jvm, 'org.apache.hadoop.fs.FileStatus')

# Get the Hadoop configuration
hadoop_conf = spark._jsc.hadoopConfiguration() # current fs.defaultFS is 'file:///' which pointing to local file system.
hadoop_conf.set("fs.defaultFS", "hdfs://localhost:9900") # This ensures that Spark uses HDFS instead of the local file system.

# Create a FileSystem object
fs = spark._jvm.FileSystem.get(hadoop_conf)

# Define the HDFS directory path
hdfs_directory_path = spark._jvm.Path("/")

# List the contents of the directory
file_statuses = fs.listStatus(hdfs_directory_path)

# Iterate through the statuses and print them
for file_status in file_statuses:
    print(file_status.getPath())

hdfs://localhost:9900/data
hdfs://localhost:9900/user


In [8]:
datalake_directory_path = spark._jvm.Path("/datalake")

# List the contents of the directory
file_statuses = fs.listStatus(datalake_directory_path)

# Iterate through the statuses and print them
for file_status in file_statuses:
    print(file_status.getPath())

Py4JJavaError: An error occurred while calling o52.listStatus.
: java.io.FileNotFoundException: File /datalake does not exist.
	at org.apache.hadoop.hdfs.DistributedFileSystem.listStatusInternal(DistributedFileSystem.java:1104)
	at org.apache.hadoop.hdfs.DistributedFileSystem.access$600(DistributedFileSystem.java:147)
	at org.apache.hadoop.hdfs.DistributedFileSystem$24.doCall(DistributedFileSystem.java:1175)
	at org.apache.hadoop.hdfs.DistributedFileSystem$24.doCall(DistributedFileSystem.java:1172)
	at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
	at org.apache.hadoop.hdfs.DistributedFileSystem.listStatus(DistributedFileSystem.java:1182)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)


In [None]:
customer_directory_path = spark._jvm.Path("/datalake/Customer")
# Check if the directory exists
exists = fs.exists(customer_directory_path)
exists

In [None]:
if exists:
    # Convert Path to string
    customer_directory_str = str(customer_directory_path.toString())
    print(customer_directory_str)
    datalake_customer_df = spark.read.parquet(customer_directory_str)
    datalake_customer_df.show(3)

In [None]:
datalake_customer_df.selectExpr("max(CustomerID)").collect()[0]

In [None]:
print(type(spark))