In [1]:
from pyspark.sql import SparkSession

# Obtain a Spark session for the application named "Basics"
# (getOrCreate returns the existing session when there is one).
spark = SparkSession.builder.appName("Basics").getOrCreate()

# Bare last expression: display the session object to confirm it exists.
spark

In [2]:
# Read the retail CSV: the first row is treated as the header and
# column types are inferred from the data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("data/online_retail.csv")
)

In [3]:
# Preview the first 10 rows of the loaded data.
df.show(n=10)

+---------+---------+--------------------+--------+------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity| InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/10 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/10 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/10 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/10 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/10 8:26|     4.25|     17850|United

In [4]:
# Print the DataFrame schema as a tree: each column's name, data type,
# and nullability.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [5]:
# List the distinct values of the Country column
# (show() prints only the first 20 rows by default).
(
    df.select("Country")
    .distinct()
    .show()
)

+---------------+
|        Country|
+---------------+
|         Sweden|
|        Germany|
|         France|
|        Belgium|
|        Finland|
|          Italy|
|           EIRE|
|      Lithuania|
|         Norway|
|          Spain|
|        Denmark|
|        Iceland|
|         Israel|
|Channel Islands|
|         Cyprus|
|    Switzerland|
|          Japan|
|         Poland|
|       Portugal|
|      Australia|
+---------------+
only showing top 20 rows



In [6]:
# Aggregation: total of UnitPrice * Quantity for each invoice.
# The per-line amount is aliased as "Total" before grouping so the result
# column is named sum(Total) instead of the auto-generated
# sum((UnitPrice * Quantity)), and the summed column is named explicitly.
(
    df.select(
        df["InvoiceNo"],
        (df["UnitPrice"] * df["Quantity"]).alias("Total"),
    )
    .groupBy("InvoiceNo")
    .sum("Total")
    .show()
)

+---------+---------------------------+
|InvoiceNo|sum((UnitPrice * Quantity))|
+---------+---------------------------+
|   536596|                      38.09|
|   536938|         1680.8799999999999|
|   537252|         26.349999999999998|
|   537691|                     310.57|
|   538041|                        0.0|
|   538184|         458.91999999999985|
|   538517|         320.28000000000003|
|   538879|          338.9799999999999|
|   539275|         403.79999999999995|
|   539630|                      751.0|
|   540499|                      365.2|
|   540540|                     169.35|
|   536597|                     102.79|
|   536608|                       38.1|
|   536765|                        0.0|
|   537439|                        0.0|
|   537631|                       17.4|
|   537870|                        0.0|
|   538142|                        0.0|
|   538177|          6272.010000000006|
+---------+---------------------------+
only showing top 20 rows



In [7]:
# Filter data: keep only the rows belonging to one invoice.
# InvoiceNo is a string column, so compare against a string literal; an
# integer literal would force an implicit cast of every InvoiceNo value,
# which can fail for non-numeric ids when strict (ANSI) casting is enabled.
df.filter(df["InvoiceNo"] == "536415").show(10)

+---------+---------+--------------------+--------+-------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|  InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------+---------+----------+--------------+
|   536415|    22952|60 CAKE CASES VIN...|      10|12/1/10 11:57|     0.55|     12838|United Kingdom|
|   536415|    22910|PAPER CHAIN KIT V...|       5|12/1/10 11:57|     2.95|     12838|United Kingdom|
|   536415|    22739|RIBBON REEL CHRIS...|       3|12/1/10 11:57|     1.65|     12838|United Kingdom|
|   536415|    22738|RIBBON REEL SNOWY...|       3|12/1/10 11:57|     1.65|     12838|United Kingdom|
|   536415|    22736|RIBBON REEL MAKIN...|       3|12/1/10 11:57|     1.65|     12838|United Kingdom|
|   536415|    22909|SET OF 20 VINTAGE...|       5|12/1/10 11:57|     0.85|     12838|United Kingdom|
|   536415|   35591T|TURQUOISE CHRISTM...|       2|12/1/10 11:57|     1.25|     12

In [8]:
# Top 10 (Country, Description) groups in the United Kingdom, ranked by the
# summed UnitPrice * Quantity.
# The Country predicate is applied to df before aggregating, so the column
# reference is resolved against the DataFrame it was taken from and only
# matching rows are grouped.
(
    df.filter(df["Country"] == "United Kingdom")
    .select(
        df["Country"],
        df["Description"],
        (df["UnitPrice"] * df["Quantity"]).alias("Total"),
    )
    .groupBy("Country", "Description")
    .sum("Total")
    .sort("sum(Total)", ascending=False)
    .limit(10)
    .show()
)

+--------------+--------------------+------------------+
|       Country|         Description|        sum(Total)|
+--------------+--------------------+------------------+
|United Kingdom|      DOTCOM POSTAGE| 34177.85999999999|
|United Kingdom|REGENCY CAKESTAND...|30512.560000000027|
|United Kingdom|WHITE HANGING HEA...|22248.690000000024|
|United Kingdom|       CHILLI LIGHTS|12475.610000000002|
|United Kingdom|RED WOOLLY HOTTIE...| 9355.869999999997|
|United Kingdom|PAPER CHAIN KIT 5...| 9313.069999999996|
|United Kingdom|WHITE SKULL HOT W...| 8867.309999999998|
|United Kingdom|HEART OF WICKER L...| 8175.289999999995|
|United Kingdom|HOT WATER BOTTLE ...| 7946.580000000001|
|United Kingdom|CHOCOLATE HOT WAT...| 7825.719999999996|
+--------------+--------------------+------------------+

