In [17]:
from pyspark.sql.session import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [3]:
# Start (or reuse, if one is already running) the Spark session used by the
# rest of this notebook.
spark = (
    SparkSession.builder
    .appName('orders_analysis')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/19 16:06:37 WARN Utils: Your hostname, Pulastyas-Mac-mini.local, resolves to a loopback address: 127.0.0.1; using 10.0.0.133 instead (on interface en1)
26/01/19 16:06:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/19 16:06:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load the pipe-delimited orders extract. The first row supplies the column
# names and Spark infers each column's type from the data.
orders = spark.read.csv(
    'data/input/orders.csv',
    header=True,
    inferSchema=True,
    sep='|',
)

# Preview the loaded rows.
orders.show()

+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|invoiceno|stockcode|         description|quantity|invoicedate|unitprice|customerid|       country|
+---------+---------+--------------------+--------+-----------+---------+----------+--------------+
|   544586|    21890|S/6 WOODEN SKITTL...|       3| 2011-02-21|     2.95|     17338|United Kingdom|
|   541104|   84509G|SET OF 4 FAIRY CA...|       3| 2011-01-13|     3.29|      NULL|United Kingdom|
|   560772|    22499|WOODEN UNION JACK...|       3| 2011-07-20|     4.96|      NULL|United Kingdom|
|   555150|    22488|NATURAL SLATE REC...|       5| 2011-05-31|     3.29|      NULL|United Kingdom|
|   570521|    21625|VINTAGE UNION JAC...|       3| 2011-10-11|     6.95|     12371|   Switzerland|
|   547053|    22087|PAPER BUNTING WHI...|      40| 2011-03-20|     2.55|     13001|United Kingdom|
|   573360|    22591|CARDHOLDER GINGHA...|       6| 2011-10-30|     3.25|     15748|United Kingdom|


In [20]:
# Top-selling product (by total amount paid) for each calendar month.
#
# NOTE(review): F.month() discards the year, so the same month from different
# years is pooled into one bucket — confirm that is intended for this data.

# Drop invoices whose number starts with 'C' (presumably cancellations —
# confirm against the data dictionary) and price each remaining line item.
line_items = (
    orders
    .filter(~F.col('invoiceno').startswith('C'))
    .withColumns({
        'month': F.month('invoicedate'),
        'sale_amt': F.round(F.col('quantity') * F.col('unitprice'), 2),
    })
)

# Round again after summing: adding doubles can leave binary floating-point
# residue (e.g. 23.400000000000002) that the string cast below would print
# verbatim and that could also split what should be a tie in the ranking.
monthly_totals = (
    line_items
    .groupBy('month', 'description')
    .agg(F.round(F.sum('sale_amt'), 2).alias('total_paid'))
)

# Highest total first within each month. rank() assigns 1 to every product
# tied for the top total, so a month can yield more than one row.
best_first_within_month = (
    Window.partitionBy('month').orderBy(F.col('total_paid').desc())
)

result = (
    monthly_totals
    .withColumn('rank', F.rank().over(best_first_within_month))
    .filter(F.col('rank') == 1)
    .drop('rank')
    # Present whole amounts without a trailing ".0" (102.0 -> "102").
    .withColumn('total_paid', F.regexp_replace(F.col('total_paid').cast('string'), r"\.0$", ""))
)


result.show(truncate=False)

+-----+----------------------------------+----------+
|month|description                       |total_paid|
+-----+----------------------------------+----------+
|1    |LUNCH BAG SPACEBOY DESIGN         |74.26     |
|2    |REGENCY CAKESTAND 3 TIER          |38.25     |
|3    |PAPER BUNTING WHITE LACE          |102       |
|4    |SPACEBOY LUNCH BOX                |23.4      |
|5    |PAPER BUNTING WHITE LACE          |51        |
|6    |Dotcomgiftshop Gift Voucher Â£50.00|41.67     |
|7    |PAPER BUNTING WHITE LACE          |56.1      |
|8    |LUNCH BAG PINK POLKADOT           |16.5      |
|9    |RED RETROSPOT PEG BAG             |34.72     |
|10   |CHOCOLATE HOT WATER BOTTLE        |102       |
|11   |RED WOOLLY HOTTIE WHITE HEART.    |228.25    |
|12   |PAPER BUNTING RETROSPOT           |35.4      |
+-----+----------------------------------+----------+



In [21]:
# Shut down the Spark session created at the top of the notebook now that the
# analysis is finished.
spark.stop()