# **PySpark Ingestion + Egress + Dataloading Techniques**

In [16]:
import os
from getpass import getpass

from pyspark.sql import SparkSession

# Local filesystem location of the MySQL JDBC connector jar
mysql_connector_jar_path = "/home/hduser/install/mysql-connector-java.jar"

# Build the SparkSession (or reuse the one already active in this kernel),
# registering the connector jar through the `spark.jars` setting.
spark = (
    SparkSession.builder
    .appName("Spark-Ingress-Egress-Dataloading-Practice")
    .config("spark.jars", mysql_connector_jar_path)
    .getOrCreate()
)

print(f"[INFO] SparkSession Object Memory Reference: {spark}")

[INFO] SparkSession Object Memory Reference: <pyspark.sql.session.SparkSession object at 0xffff7610b8e0>


## **1. Reading CSV data and writing it into a MySQL database using the JDBC option**

In [17]:
###### Reading CSV data into a DataFrame #######

# Sample customer-info data: the file has no header row and is comma-separated.
#
#   cat /home/hduser/custinfo.csv
#
#   4000001,Kristina,Chung,55,Pilot
#   4000002,Paige,Chen,77,Teacher
#   4000003,Sherri,Melton,34,Firefighter
#   4000004,Gretchen,Hill,66,Computer hardware engineer
#   4000005,Karen,Puckett,74,Lawyer
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Explicit schema: one nullable field per CSV column, in file order.
custinfo_schema = StructType([
    StructField('custid', IntegerType(), True),
    StructField('first_name', StringType(), True),
    StructField('last_name', StringType(), True),
    StructField('age', IntegerType(), True),
    StructField('profession', StringType(), True),
])

# Read the CSV with the schema above (no header row, no schema inference).
df1 = spark.read.csv(
    path="file:///home/hduser/custinfo.csv",
    header=False,
    sep=",",
    inferSchema=False,
    schema=custinfo_schema,
)
df1.show(truncate=False, n=5)
print(f"[INFO] df1.count() = {df1.count()}")


+-------+----------+---------+---+--------------------------+
|custid |first_name|last_name|age|profession                |
+-------+----------+---------+---+--------------------------+
|4000001|Kristina  |Chung    |55 |Pilot                     |
|4000002|Paige     |Chen     |77 |Teacher                   |
|4000003|Sherri    |Melton   |34 |Firefighter               |
|4000004|Gretchen  |Hill     |66 |Computer hardware engineer|
|4000005|Karen     |Puckett  |74 |Lawyer                    |
+-------+----------+---------+---+--------------------------+
only showing top 5 rows

[INFO] df1.count() = 9999


In [18]:
###### Write the data into MySql DB ######

# JDBC Options
# `createDatabaseIfNotExist=true` is passed on the URL so the `stocksdb`
# database does not have to exist before the write.
url1 = 'jdbc:mysql://127.0.0.1:3306/stocksdb?createDatabaseIfNotExist=true'
dbproperties = {
    'user': 'root',
    # Never keep the password in the notebook: take it from the MYSQL_PASSWORD
    # environment variable, or prompt for it when the variable is not set.
    'password': os.environ.get('MYSQL_PASSWORD') or getpass("MySQL password for user 'root': "),
    'driver': 'com.mysql.cj.jdbc.Driver',
}

# Write into DB: mode="overwrite" replaces any existing `custinfo` table contents.
df1.write.jdbc(url=url1, table="custinfo", mode="overwrite", properties=dbproperties)
print("[INFO] CSV file data write into MySQL DB is successful.")


[INFO] CSV file data write into MySQL DB is successful.


In [19]:
###### Simple way to read the data from MySql/RDBMS DB using JDBC ######

# JDBC Options
url1 = 'jdbc:mysql://127.0.0.1:3306/stocksdb'
dbproperties = {
    'user': 'root',
    # Never keep the password in the notebook: take it from the MYSQL_PASSWORD
    # environment variable, or prompt for it when the variable is not set.
    'password': os.environ.get('MYSQL_PASSWORD') or getpass("MySQL password for user 'root': "),
    'driver': 'com.mysql.cj.jdbc.Driver',
}

# Read through a query instead of a bare table name: the SELECT is wrapped in
# parentheses and aliased so it can be passed as the `table` argument.
table_query = "(select * from stocksdb.custinfo) as tablename"
df2_db = spark.read.jdbc(url=url1, table=table_query, properties=dbproperties)

# Mark the DataFrame for caching, then preview the first five rows.
df2_db.cache()
df2_db.show(truncate=False, n=5)

+-------+----------+---------+---+--------------------------+
|custid |first_name|last_name|age|profession                |
+-------+----------+---------+---+--------------------------+
|4000001|Kristina  |Chung    |55 |Pilot                     |
|4000002|Paige     |Chen     |77 |Teacher                   |
|4000003|Sherri    |Melton   |34 |Firefighter               |
|4000004|Gretchen  |Hill     |66 |Computer hardware engineer|
|4000005|Karen     |Puckett  |74 |Lawyer                    |
+-------+----------+---------+---+--------------------------+
only showing top 5 rows



In [29]:
###### Optimized way to read the data from any RDBMS DB using JDBC ######

# Question: How to improve performance for JDBC?
# Levers: partitioned reads, fetch size, caching, pushdown optimization, etc.
# Relevant settings: partitionColumn, numPartitions, lowerBound, upperBound,
# predicates, fetchsize.

# JDBC Options for performance optimization
url1 = 'jdbc:mysql://127.0.0.1:3306/stocksdb'
dbproperties = {
    'user': 'root',
    # Never keep the password in the notebook: take it from the MYSQL_PASSWORD
    # environment variable, or prompt for it when the variable is not set.
    'password': os.environ.get('MYSQL_PASSWORD') or getpass("MySQL password for user 'root': "),
    'driver': 'com.mysql.cj.jdbc.Driver',
    # Performance optimization options (values as strings).
    # The four partitioning options are supplied here through `properties`;
    # DataFrameReader.jdbc also accepts them as the explicit
    # column / lowerBound / upperBound / numPartitions arguments.
    'partitionColumn': 'custid',  # Column used to divide data into sections for parallel processing.
    # lowerBound/upperBound only set the partition stride; they do not filter
    # rows, so ids outside the range still land in the first/last partition.
    'lowerBound': '4000001',  # Minimum value of the partition column used to compute the stride.
    # The bound was 4000100 before: with ~9999 rows that put nearly the whole
    # table into the last partition.
    # Assumes custid is contiguous from 4000001 (9999 rows counted earlier)
    # -- TODO confirm with SELECT MAX(custid).
    'upperBound': '4009999',  # Maximum value of the partition column used to compute the stride.
    'numPartitions': '3',     # Maximum number of partitions (parallel JDBC connections) for the read.
    'pushDownPredicate': 'true',  # Sends filters (WHERE clauses) to the database for early processing.
    'pushDownAggregate': 'true',  # Sends aggregations (SUM, COUNT) to the database for early processing.
    'queryTimeout': '120',    # Maximum time (in seconds) a database query can run before timing out.
    # NOTE(review): 10 rows per round trip is small for a bulk read, and MySQL
    # Connector/J may not honor the fetch size without cursor fetch -- confirm.
    'fetchSize': '10',        # Number of rows requested from the database in each round trip.
    # Spark documents isolationLevel as a write-only option, so it is not
    # expected to influence this read; kept so the dict stays unchanged.
    'isolationLevel': 'READ_COMMITTED'
}

# Read through a query instead of a bare table name: the SELECT is wrapped in
# parentheses and aliased so it can be passed as the `table` argument.
table_query = "(select * from stocksdb.custinfo) as tablename"
df2_db = spark.read.jdbc(url=url1, table=table_query, properties=dbproperties)
df2_db.show(truncate=False, n=5)

+-------+----------+---------+---+--------------------------+
|custid |first_name|last_name|age|profession                |
+-------+----------+---------+---+--------------------------+
|4000001|Kristina  |Chung    |55 |Pilot                     |
|4000002|Paige     |Chen     |77 |Teacher                   |
|4000003|Sherri    |Melton   |34 |Firefighter               |
|4000004|Gretchen  |Hill     |66 |Computer hardware engineer|
|4000005|Karen     |Puckett  |74 |Lawyer                    |
+-------+----------+---------+---+--------------------------+
only showing top 5 rows

