# PySpark MySQL

In [38]:
from pyspark.sql import SparkSession
# Build (or reuse) a local Spark session named "pyspark-mysql" that uses all
# local cores. The jar on the driver classpath is presumably the MySQL JDBC
# connector, needed so the driver class can be loaded.
spark = (
    SparkSession.builder
    .config("spark.driver.extraClassPath", "C:/Users/malam/jdbc/mysql.jar")
    .appName("pyspark-mysql")
    .master("local[*]")
    .getOrCreate()
)

# Reading database details


In [39]:
from jproperties import Properties
  
# Load the JDBC connection settings from a Java-style .properties file so the
# credentials stay out of the notebook.
configs = Properties()
with open('C:\\Users\\malam\\jdbc\\mysql.properties', 'rb') as read_prop:
    configs.load(read_prop)

# Each lookup returns an entry object; `.data` is the property's value.
jdbc_url, jdbc_user, jdbc_password, jdbc_driver = (
    configs.get(prop_key).data
    for prop_key in ("jdbcUrl", "user", "password", "driver")
)

# Reading the data from database

In [40]:
# Plain JDBC read of the `emp` table with no partitioning options.
# NOTE: despite its name, this variable holds the loaded DataFrame (load() is
# called here), not a DataFrameReader.
reader_no_partitioning = (
    spark.read
    .format("jdbc")
    .options(
        url=jdbc_url,
        user=jdbc_user,
        password=jdbc_password,
        ssl="true",
        driver=jdbc_driver,
    )
    .option("dbtable", "emp")
    .load()
)

In [41]:
# Print the column names, types and nullability of the DataFrame read from `emp`.
reader_no_partitioning.printSchema()

root
 |-- empd_id: integer (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- dept_id: integer (nullable = true)



In [42]:
# Mark the DataFrame for caching and run count(), an action, so the JDBC read
# is actually executed.
reader_no_partitioning.cache().count()
# Drop the cached data again; as the last expression, its return value (the
# DataFrame) is what the cell displays.
reader_no_partitioning.unpersist()

DataFrame[empd_id: int, emp_name: string, dept_id: int]

# Partitioning on a skewed column

In [43]:
# Configure (but do not yet load) a partitioned JDBC read of `emp`, split on
# `empd_id`. Per the Spark JDBC docs, lowerBound/upperBound only set the
# partition stride; they do not filter rows.
reader_partitioning_skewed = (
    spark.read
    .format("jdbc")
    .options(
        url=jdbc_url,
        user=jdbc_user,
        password=jdbc_password,
        ssl="true",
        driver=jdbc_driver,
    )
    .options(
        partitionColumn="empd_id",
        numPartitions=8,
        lowerBound=0,
        upperBound=4,
    )
    .option("dbtable", "emp")
)


In [44]:
# Turn the reader configured with the skewed bounds into a DataFrame.
df_partitioning_skewed = reader_partitioning_skewed.load()

In [45]:
# Mark the DataFrame for caching and count its rows; count() is the action
# that triggers the partitioned JDBC read.
df_partitioning_skewed.cache().count()

4

In [18]:
# Release the cached copy of the skewed-partition DataFrame.
df_partitioning_skewed.unpersist()

DataFrame[empd_id: int, emp_name: string, dept_id: int]

# Uniform partition

In [34]:
# Same partitioned read as the skewed example, but with upperBound=8 so the
# [0, 8) range is split across the 8 requested partitions.
reader_partitioning_unif = (
    spark.read
    .format("jdbc")
    .options(
        url=jdbc_url,
        user=jdbc_user,
        password=jdbc_password,
        ssl="true",
        driver=jdbc_driver,
    )
    .options(
        partitionColumn="empd_id",
        numPartitions=8,
        lowerBound=0,
        upperBound=8,
    )
    .option("dbtable", "emp")
)

In [35]:
# Turn the reader configured with the uniform bounds into a DataFrame.
df_partitioning_unif = reader_partitioning_unif.load()

In [36]:
# Mark the DataFrame for caching and count its rows; count() is the action
# that triggers the partitioned JDBC read.
df_partitioning_unif.cache().count()

4

In [22]:
# Release the cached copy of the uniformly partitioned DataFrame.
df_partitioning_unif.unpersist()

DataFrame[empd_id: int, emp_name: string, dept_id: int]

# Writing to MySQL database

In [27]:
# Sample employee rows, one tuple per row: (empd_id, emp_name, dept_id).
data = [
    (1, 'mahfooz', 1),
    (2, 'hamdan', 1),
    (3, 'hadiya', 2),
    (4, 'shaziya', 2),
]

# Build a DataFrame whose column names match the `emp` table.
df = spark.createDataFrame(data, schema=["empd_id", "emp_name", "dept_id"])


In [33]:
# Append the rows of `df` to the `emp` table over JDBC. "append" adds to the
# existing table, so re-running this cell inserts the rows again.
(
    df.write
    .format("jdbc")
    .mode("append")
    .options(
        url=jdbc_url,
        user=jdbc_user,
        password=jdbc_password,
        ssl="true",
        driver=jdbc_driver,
    )
    .option("dbtable", "emp")
    .save()
)

# Push down a query to the database engine
You can push down an entire query to the database and return just the result. 

In [51]:
# The SQL below is handed to the database through the JDBC `query` option, so
# the filtering happens in MySQL and only the result comes back to Spark.
pushdown_query = "(select * from emp where empd_id < 10)"
df = (
    spark.read
    .format("jdbc")
    .options(
        url=jdbc_url,
        user=jdbc_user,
        password=jdbc_password,
        ssl="true",
        driver=jdbc_driver,
    )
    .option("query", pushdown_query)
    .load()
)

In [52]:
# Print the rows returned by the pushed-down query as a text table.
df.show()

+-------+--------+-------+
|empd_id|emp_name|dept_id|
+-------+--------+-------+
|      1| mahfooz|      1|
|      2|  hamdan|      1|
|      3|  hadiya|      2|
|      4| shaziya|      2|
+-------+--------+-------+



# Manage parallelism

# Read from JDBC connections across multiple workers

In order to read data in parallel, the Spark JDBC data source must be configured with appropriate partitioning information so that it can issue multiple concurrent queries to the external database. If you neglect to configure partitioning, all data will be fetched on the driver using a single JDBC query, which runs the risk of causing the driver to throw an out-of-memory (OOM) exception.

In [55]:
# Connection settings passed to spark.read.jdbc() as a properties dict.
connectionProperties = {
    "user": jdbc_user,
    "password": jdbc_password,
    # Use the driver class loaded from the properties file, as every other
    # cell does, instead of hard-coding "com.mysql.jdbc.Driver" (a legacy
    # class name in newer Connector/J releases). The properties file stays
    # the single place to change it.
    "driver": jdbc_driver,
}

# Partitioned read of `emp` on `empd_id` using the keyword-argument form of
# the JDBC reader.
# NOTE(review): the bounds only set the partition stride; with the 4-row demo
# table, almost all of the 100 partitions will be empty. Confirm these values
# are meant as an illustration of a large table.
df = spark.read.jdbc(
    url=jdbc_url,
    table="emp",
    column="empd_id",
    lowerBound=1,
    upperBound=100000,
    numPartitions=100,
    properties=connectionProperties,
)

DataFrame[empd_id: int, emp_name: string, dept_id: int]

In [56]:
# Print the rows read through the partitioned JDBC connection as a text table.
df.show()

+-------+--------+-------+
|empd_id|emp_name|dept_id|
+-------+--------+-------+
|      1| mahfooz|      1|
|      2|  hamdan|      1|
|      3|  hadiya|      2|
|      4| shaziya|      2|
+-------+--------+-------+



# Tune the JDBC fetchSize parameter

JDBC drivers have a fetchSize parameter that controls the number of rows fetched at a time from the remote JDBC database. If this value is set too low, your workload may become latency-bound due to the high number of round-trip requests between Spark and the external database needed to fetch the full result set. If this value is too high, you risk OOM exceptions. In Spark, set it with the `fetchsize` option on the JDBC reader, e.g. `.option("fetchsize", 1000)`.

# Consider the impact of indexes
If you are reading in parallel (using one of the partitioning techniques), Spark issues concurrent queries to the JDBC database. If these queries require full table scans, the remote database can become the bottleneck and the read can become extremely slow. You should therefore consider the impact of indexes when choosing a partitioning column, and pick a column such that the individual partitions’ queries can be executed reasonably efficiently in parallel.

**Make sure that the database has an index on the partitioning column.**

# Consider whether the number of partitions is appropriate

Using too many partitions when reading from the external database risks overloading that database with too many queries. Most database systems limit the number of concurrent connections.

As a starting point, aim to have the number of partitions be close to the number of cores or task slots in your Spark cluster, in order to maximize parallelism while keeping the total number of queries capped at a reasonable limit. If you need lots of parallelism after fetching the JDBC rows (because you’re doing something CPU-bound in Spark) but don’t want to issue too many concurrent queries to your database, consider using a lower numPartitions for the JDBC read and then doing an explicit repartition() in Spark.

# Consider database-specific tuning techniques
The database vendor may have a guide on tuning performance for ETL and bulk access workloads.

In [57]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()