In [2]:
import pyspark
from pyspark.sql import SparkSession
# Spark init
spark = SparkSession.builder \
    .master("local") \
    .appName('jupyter-pyspark') \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Load a Spark dataframe of USA country suppliers and their products for products that are not discontinued. Include supplier company and contact names, country and phone. From products include product name, discontinued, unit price and units in stock.

In [15]:
df = spark.read.csv("/home/jovyan/datasets/customers/customers.csv", header=True, inferSchema=True)

In [30]:
df.toPandas().head(5)

Unnamed: 0,First,Last,Email,Gender,Last IP Address,City,State,Total Orders,Total Purchased,Months Customer
0,Al,Fresco,afresco@dayrep.com,M,74.111.18.161,Syracuse,NY,1,45,1
1,Abby,Kuss,akuss@rhyta.com,F,23.80.125.101,Phoenix,AZ,1,25,2
2,Arial,Photo,aphoto@dayrep.com,F,24.0.14.56,Newark,NJ,1,680,1
3,Bette,Alott,balott@rhyta.com,F,56.216.127.219,Raleigh,NC,6,560,18
4,Barb,Barion,bbarion@superrito.com,F,38.68.15.223,Dallas,TX,4,1590,1


In [31]:
df.printSchema()

root
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Last IP Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Total Orders: integer (nullable = true)
 |-- Total Purchased: integer (nullable = true)
 |-- Months Customer: integer (nullable = true)



In [32]:
df.createOrReplaceTempView("fm_customers")

In [33]:
spark.sql("show tables").show()

+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
|         |   customers|       true|
|         |fm_customers|       true|
+---------+------------+-----------+



In [35]:
spark.sql("show columns in fm_customers").show()

+---------------+
|       col_name|
+---------------+
|          First|
|           Last|
|          Email|
|         Gender|
|Last IP Address|
|           City|
|          State|
|   Total Orders|
|Total Purchased|
|Months Customer|
+---------------+



In [38]:
spark.sql("select * from fm_customers").show(5)

+-----+------+--------------------+------+---------------+--------+-----+------------+---------------+---------------+
|First|  Last|               Email|Gender|Last IP Address|    City|State|Total Orders|Total Purchased|Months Customer|
+-----+------+--------------------+------+---------------+--------+-----+------------+---------------+---------------+
|   Al|Fresco|  afresco@dayrep.com|     M|  74.111.18.161|Syracuse|   NY|           1|             45|              1|
| Abby|  Kuss|     akuss@rhyta.com|     F|  23.80.125.101| Phoenix|   AZ|           1|             25|              2|
|Arial| Photo|   aphoto@dayrep.com|     F|     24.0.14.56|  Newark|   NJ|           1|            680|              1|
|Bette| Alott|    balott@rhyta.com|     F| 56.216.127.219| Raleigh|   NC|           6|            560|             18|
|Barb |Barion|bbarion@superrito...|     F|   38.68.15.223|  Dallas|   TX|           4|           1590|              1|
+-----+------+--------------------+------+------

In [43]:
query = """
with source as (
    select State, `Total Orders` as total_orders, Gender
    from fm_customers
)
select State, sum(total_orders) as total_orders
    from source
    where gender = 'F'
    group by State
"""
spark.sql(query).show()

+-----+------------+
|State|total_orders|
+-----+------------+
|   AZ|           1|
|   NJ|           1|
|   VA|           8|
|   CA|           8|
|   NC|           6|
|   ME|           1|
|   OH|           9|
|   NY|          20|
|   TX|           4|
|   FL|           5|
+-----+------------+

