In [1]:
# Initialise findspark first: this cell runs before the cell that imports
# pyspark, so the initialisation is in place when those imports execute.
# NOTE(review): presumably findspark locates the local Spark install and
# makes pyspark importable - confirm against the findspark docs.
import findspark
findspark.init()

In [2]:
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Spark configuration: local master, application name "HelloSparkSession".
# Further settings can be chained here with .set("property", "value").
config = SparkConf().setMaster("local").setAppName("HelloSparkSession")

# SparkSession is the entry point for Spark SQL and the DataFrame API.
spark = (
    SparkSession.builder
    .config(conf=config)
    .getOrCreate()
)

# SparkContext that belongs to the session, exposed under the usual name.
sc = spark.sparkContext

In [4]:
# Load the sector master data (CSV with a header row) from HDFS.
#   header      - take the column names from the first line
#   inferSchema - let Spark derive the column types from the data
#   sep         - field separator; the key must be spelled exactly, because
#                 an unrecognised option name is ignored without any error
sectorDf = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("sep", ",")
    .load("hdfs://localhost:9000/stocks/sectors")
)

# Show the resulting schema and a two-row sample.
sectorDf.printSchema()
sectorDf.show(2)

root
 |-- Company Name: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Symbol: string (nullable = true)
 |-- Series: string (nullable = true)
 |-- ISIN Code: string (nullable = true)

+------------------+------------------+----------+------+------------+
|      Company Name|          Industry|    Symbol|Series|   ISIN Code|
+------------------+------------------+----------+------+------------+
|    Axis Bank Ltd.|FINANCIAL SERVICES|  AXISBANK|    EQ|INE238A01034|
|Bajaj Finance Ltd.|FINANCIAL SERVICES|BAJFINANCE|    EQ|INE296A01024|
+------------------+------------------+----------+------+------------+
only showing top 2 rows



In [18]:
# Write the sector DataFrame into MySQL over JDBC.
#
# Save modes:
#   overwrite - replace the contents of an existing table
#   append    - add the records to an existing table
#   ignore    - if the table already exists, silently skip the write;
#               otherwise create the table and insert the records
#   error     - raise an error if the table already exists (even when it is
#               empty); otherwise create the table and insert the records
#
# JDBC options:
#   url      - JDBC connection string of the target database
#   driver   - class name of the MySQL JDBC driver
#   user     - MySQL user name
#   password - MySQL password
#   dbtable  - name of the MySQL table to write
import getpass
import os

# Credentials are kept out of the notebook: the user name comes from
# MYSQL_USER (default "team"), the password from MYSQL_PASSWORD, and when
# that variable is not set the password is prompted for interactively.
mysql_user = os.environ.get("MYSQL_USER", "team")
mysql_password = os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: ")

(
    sectorDf
    # Rename the columns whose names contain a space before writing them.
    .withColumnRenamed("Company Name", "Company")
    .withColumnRenamed("ISIN Code", "ISIN")
    .write
    .mode("error")
    .format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/stockdb")
    .option("driver", "com.mysql.jdbc.Driver")
    .option("user", mysql_user)
    .option("password", mysql_password)
    .option("dbtable", "sectors_pyspark")
    .save()
)

In [7]:
# In MySQL, verify that the rows were written:
# SELECT * FROM sectors_pyspark;

In [9]:
# Read the table back from MySQL into a DataFrame.
# No schema has to be declared or inferred here: the database is already
# structured (table, columns, data types), so Spark reads the schema
# definition from MySQL and builds the DataFrame from it.
import getpass
import os

# Credentials are kept out of the notebook: the user name comes from
# MYSQL_USER (default "team"), the password from MYSQL_PASSWORD, and when
# that variable is not set the password is prompted for interactively.
mysql_user = os.environ.get("MYSQL_USER", "team")
mysql_password = os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password: ")

sectorDataFromMySqlDf = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:mysql://localhost:3306/stockdb")
    .option("driver", "com.mysql.jdbc.Driver")
    .option("user", mysql_user)
    .option("password", mysql_password)
    .option("dbtable", "sectors_pyspark")
    .load()
)

# Show the schema obtained from MySQL and a five-row sample.
sectorDataFromMySqlDf.printSchema()
sectorDataFromMySqlDf.show(5)

root
 |-- Company: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Symbol: string (nullable = true)
 |-- Series: string (nullable = true)
 |-- ISIN: string (nullable = true)

+--------------------+------------------+----------+------+------------+
|             Company|          Industry|    Symbol|Series|        ISIN|
+--------------------+------------------+----------+------+------------+
|      Axis Bank Ltd.|FINANCIAL SERVICES|  AXISBANK|    EQ|INE238A01034|
|  Bajaj Finance Ltd.|FINANCIAL SERVICES|BAJFINANCE|    EQ|INE296A01024|
|  Bajaj Finserv Ltd.|FINANCIAL SERVICES|BAJAJFINSV|    EQ|INE918I01018|
|Cholamandalam Inv...|FINANCIAL SERVICES|  CHOLAFIN|    EQ|INE121A01024|
|HDFC Asset Manage...|FINANCIAL SERVICES|   HDFCAMC|    EQ|INE127D01025|
+--------------------+------------------+----------+------+------------+
only showing top 5 rows

