In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext


# Confirm that the imports above completed without raising.
print('Import successful')

Import successfull


<u><h1>First Part</h1></u>
----------------------
A very simple script that initializes a Spark session and imports a flat file.

In [4]:
# Initialize the Spark session.
# The session is the application's entry point for interacting with Spark APIs.
# getOrCreate() returns the already-existing session if there is one and
# only creates a new session otherwise.
session_builder = SparkSession.builder.appName("reading csv")
scSpark = session_builder.getOrCreate()

# Read the flat file; the first row of the CSV is used as the header.
file_data = './Data/data.csv'
sdfData = scSpark.read.csv(file_data, header=True, sep=",")
sdfData = sdfData.cache()

# Report the row count, then display the rows.
total_records = sdfData.count()
print(f'Total Records = {total_records}')
sdfData.show()

Total Records = 4
+------+---+--------+
|  name|age| country|
+------+---+--------+
| adnan| 40|Pakistan|
|  maaz|  9|Pakistan|
| musab|  4|Pakistan|
|ayesha| 32|Pakistan|
+------+---+--------+



<u><h1>Second Part</h1></u>
We will explore using SQL queries in Spark, working with real data that can be found on <a href="https://www.kaggle.com/aungpyaeap/supermarket-sales" target="_blank">Kaggle</a>.
<br>
<h2>Extract</h2>


In [19]:
# Extract the data: load the supermarket sales CSV, using its first row as
# the header. No schema is supplied, and the printed schema below shows
# every column typed as string.
data_file = './Data/supermarket_sales - Sheet1.csv'
data = scSpark.read.csv(data_file, header=True, sep=",").cache()

# Explore the dataset
# Look at the number of records
print(f'Total Records = {data.count()}')

# Look at the schema.
# printSchema() and show() write their output directly and return None, so
# they are called bare; wrapping them in print() would emit a stray "None".
print('\nSchema')
print('--------------------------------------------------')
data.printSchema()
print('--------------------------------------------------\n')

# Look at the summary statistics of each column.
print('Summary statistics')
print('--------------------------------------------------\n')
data.summary().show()

Total Records = 1000

Schema
--------------------------------------------------
root
 |-- Invoice ID: string (nullable = true)
 |-- Branch: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Customer type: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Unit price: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Tax 5%: string (nullable = true)
 |-- Total: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Payment: string (nullable = true)
 |-- cogs: string (nullable = true)
 |-- gross margin percentage: string (nullable = true)
 |-- gross income: string (nullable = true)
 |-- Rating: string (nullable = true)

None
--------------------------------------------------

Summary statistics
--------------------------------------------------

+-------+-----------+------+--------+-------------+------+--------------------+------------------+----

<h2>Transform</h2>

In [20]:
# Group the sales records by gender and count the rows in each group.
gender = data.groupBy('Gender').count()

# show() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a trailing "None" to the output).
gender.show()

+------+-----+
|Gender|count|
+------+-----+
|Female|  501|
|  Male|  499|
+------+-----+

None


In [22]:
# Expose the DataFrame to Spark SQL as a temporary view named `sales`.
# createOrReplaceTempView replaces the deprecated registerTempTable and can
# be re-run safely because it overwrites any existing view of the same name.
data.createOrReplaceTempView('sales')

# Query the temporary view to select all fields.
output = scSpark.sql('SELECT * FROM sales')
output.show()

+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+---------+-----+-----------+------+-----------------------+------------+------+
| Invoice ID|Branch|     City|Customer type|Gender|        Product line|Unit price|Quantity| Tax 5%|   Total|     Date| Time|    Payment|  cogs|gross margin percentage|gross income|Rating|
+-----------+------+---------+-------------+------+--------------------+----------+--------+-------+--------+---------+-----+-----------+------+-----------------------+------------+------+
|750-67-8428|     A|   Yangon|       Member|Female|   Health and beauty|     74.69|       7|26.1415|548.9715| 1/5/2019|13:08|    Ewallet|522.83|            4.761904762|     26.1415|   9.1|
|226-31-3081|     C|Naypyitaw|       Normal|Female|Electronic access...|     15.28|       5|   3.82|   80.22| 3/8/2019|10:29|       Cash|  76.4|            4.761904762|        3.82|   9.6|
|631-41-3108|     A|   Yangon|       Normal|  Male|  Ho

In [23]:
# Modify the previous query to add a WHERE clause: keep only rows with a
# unit price below 15 and a quantity below 10.
#
# The CSV is read without a schema, so these columns hold strings. The
# explicit CASTs make the numeric comparison intentional instead of relying
# on Spark's implicit string-to-number coercion. The column name is written
# exactly as it appears in the CSV header (`Unit price`).
#
# A triple-quoted string replaces backslash line continuations inside the
# literal, which break if any whitespace follows the backslash.
output2 = scSpark.sql("""
    SELECT *
    FROM sales
    WHERE CAST(`Unit price` AS DOUBLE) < 15
      AND CAST(Quantity AS INT) < 10
""")
output2.show()

+-----------+------+---------+-------------+------+--------------------+----------+--------+------+--------+---------+-----+-----------+------+-----------------------+------------+------+
| Invoice ID|Branch|     City|Customer type|Gender|        Product line|Unit price|Quantity|Tax 5%|   Total|     Date| Time|    Payment|  cogs|gross margin percentage|gross income|Rating|
+-----------+------+---------+-------------+------+--------------------+----------+--------+------+--------+---------+-----+-----------+------+-----------------------+------------+------+
|351-62-0822|     B| Mandalay|       Member|Female| Fashion accessories|     14.48|       4| 2.896|  60.816| 2/6/2019|18:07|    Ewallet| 57.92|            4.761904762|       2.896|   4.5|
|871-39-9221|     C|Naypyitaw|       Normal|Female|Electronic access...|     12.45|       6| 3.735|  78.435| 2/9/2019|13:11|       Cash|  74.7|            4.761904762|       3.735|   4.1|
|586-25-0848|     A|   Yangon|       Normal|Female|   Sports

In [27]:
# Aggregate values: count the number of sales rows per city by grouping the
# `sales` table on its City column.
city_totals_query = 'SELECT COUNT(*) as total, City FROM sales GROUP BY City'
output3 = scSpark.sql(city_totals_query)
output3.show()

+-----+---------+
|total|     City|
+-----+---------+
|  328|Naypyitaw|
|  332| Mandalay|
|  340|   Yangon|
+-----+---------+



<h2>Load</h2>

In [28]:
# Write the per-city totals as JSON. Multiple files will be created under
# the 'filtered.json' output path.
# mode('overwrite') lets this cell be re-run: the default save mode raises
# an error when the output path already exists.
output3.write.format('json').mode('overwrite').save('filtered.json')

In [30]:
# To save the result as only one file, use coalesce(1) before writing.
# mode('overwrite') lets this cell be re-run: the default save mode raises
# an error when the output path already exists.
output3.coalesce(1).write.format('json').mode('overwrite').save('filtered_onefile.json')

# NEXT STEP:
Connect to a MySQL database.