In [30]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import date_format
from pyspark.sql.types import *
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import shutil
# Initialise Plotly's offline notebook mode for this kernel.
init_notebook_mode(connected=True)

# Single Spark session for the notebook; getOrCreate() reuses an existing one.
spark = SparkSession.builder.appName("Python Spark SQL basic example").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Load the raw CSV, taking column names from the header row and letting
# Spark infer the column types.
df = sqlContext.read.load(
    '/test_dev/mba-code/dataset.csv',
    format='csv',
    header='true',
    inferSchema='true',
)

# Keep the invoice columns and derive `weekday` from InvoiceDateWS using the
# 'E' datetime pattern (day-of-week text, e.g. "Mon").
invoices_with_weekday = df.select(
    "InvoiceNo",
    "StockCode",
    "Description",
    "Quantity",
    "InvoiceDate",
    "InvoiceDateWS",
    date_format('InvoiceDateWS', 'E').alias('weekday'),
    "CustomerID",
    "Country",
)

# mode("overwrite") makes this cell re-runnable: the default save mode raises
# if "Invoices.parquet" already exists from a previous run.
invoices_with_weekday.write.mode("overwrite").save("Invoices.parquet", format="parquet")

parquetFile = spark.read.parquet("Invoices.parquet")
# Parquet files can also be used to create a temporary view and then used in SQL statements.
parquetFile.createOrReplaceTempView("parquetFile")

In [17]:
# Row count per product description, read from the `parquetFile` temp view.
# NOTE(review): Count(Quantity) counts rows with a non-null Quantity; it does
# not sum the quantities. Confirm that is the intended metric.
DescriptionGrp = spark.sql(
    "SELECT distinct Description,Count(Quantity) as QCount "
    "FROM parquetFile group by Description"
)

# coalesce(1) writes a single CSV part file. mode("overwrite") lets the cell be
# re-run: the default save mode raises if the "DescriptionGrp" directory exists.
(
    DescriptionGrp.coalesce(1)
    .write.mode("overwrite")
    .format("csv")
    .option("header", "true")
    .save("DescriptionGrp")
)

In [24]:
# Row count per (Description, InvoiceDateWS) pair; InvoiceDateWS is exposed
# under the alias PURCHASEDATE.
MostObjsPerDay = spark.sql(
    "SELECT distinct Description,Count(Quantity) as QCount,InvoiceDateWS as PURCHASEDATE "
    "FROM parquetFile group by Description,InvoiceDateWS"
)

# coalesce(1) writes a single CSV part file. mode("overwrite") lets the cell be
# re-run: the default save mode raises if the "MostObjsPerDay" directory exists.
(
    MostObjsPerDay.coalesce(1)
    .write.mode("overwrite")
    .format("csv")
    .option("header", "true")
    .save("MostObjsPerDay")
)

In [25]:
# Row count per (Description, CustomerID) pair, ordered by QCount.
# NOTE(review): the ordering is ascending, so the smallest counts come first.
# If "top" products are wanted first, `order by QCount desc` may be intended.
TopPerCust = spark.sql(
    "SELECT distinct Description,Count(Quantity) as QCount,CustomerID "
    "FROM parquetFile group by Description,CustomerID order by QCount"
)

# coalesce(1) writes a single CSV part file. mode("overwrite") lets the cell be
# re-run: the default save mode raises if the "TopPerCust" directory exists.
(
    TopPerCust.coalesce(1)
    .write.mode("overwrite")
    .format("csv")
    .option("header", "true")
    .save("TopPerCust")
)

In [31]:
import calendar
# Row count per (Description, CustomerID, weekday) triple, ordered by QCount
# ascending. `weekday` is a column of the `parquetFile` view.
WeekdayPerProduct = spark.sql(
    "SELECT distinct Description,weekday,Count(Quantity) as QCount,CustomerID "
    "FROM parquetFile group by Description,CustomerID,weekday order by QCount"
)

# coalesce(1) writes a single CSV part file. mode("overwrite") lets the cell be
# re-run: the default save mode raises if the "WeekdayPerProduct" directory exists.
(
    WeekdayPerProduct.coalesce(1)
    .write.mode("overwrite")
    .format("csv")
    .option("header", "true")
    .save("WeekdayPerProduct")
)