# AdventureWorks - Medium

In [1]:
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

sc = (SparkSession.builder.appName('app14-2') 
      .master(f'spark://{SVR}:7077') 
      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') 
      .config('spark.cores.max', '4') 
      .config('spark.executor.instances', '1') 
      .config('spark.executor.cores', '2') 
      .config('spark.executor.memory', '10g') 
      .enableHiveSupport().getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
cust_aw = sc.read.table('sqlzoo.CustomerAW')
cust_addr = sc.read.table('sqlzoo.CustomerAddress')
addr = sc.read.table('sqlzoo.Address')
product = sc.read.table('sqlzoo.Product')
order_det = sc.read.table('sqlzoo.SalesOrderDetail')
order_head = sc.read.table('sqlzoo.SalesOrderHeader')
prod_model = sc.read.table('sqlzoo.ProductModel')
prod_model_prod = sc.read.table('sqlzoo.ProductModelProductDescription')
prod_desc = sc.read.table('sqlzoo.ProductDescription')
prod_cat = sc.read.table('sqlzoo.ProductCategory')

## 6.
A "Single Item Order" is a customer order where only one item is ordered. Show the SalesOrderID and the UnitPrice for every Single Item Order.

In [3]:
(order_det.filter(col('OrderQty')==1)
 .groupBy('SalesOrderID', 'UnitPrice')
 .agg(count('SalesOrderDetailID').alias('n'))
 .filter(col('n')==1)
 .select('SalesOrderID', 'UnitPrice')
 .toPandas())

                                                                                

Unnamed: 0,SalesOrderID,UnitPrice
0,71796,31.58
1,71815,202.33
2,71784,200.05
3,71797,202.33
4,71782,31.58
...,...,...
77,71899,26.72
78,71832,26.72
79,71845,26.72
80,71845,54.89


## 7.
Where did the racing socks go? List the product name and the CompanyName for all Customers who ordered ProductModel 'Racing Socks'.

In [4]:
(product.join(order_det, on='ProductID')
 .join(order_head, on='SalesOrderID')
 .join(cust_aw, on='CustomerID')
 .join(prod_model.withColumnRenamed('Name', 'ModelName'), 
       on='ProductModelID')
 .filter(col('ModelName')=='Racing Socks')
 .select('CompanyName', 'Name')
 .distinct()
 .orderBy('CompanyName')
 .toPandas())

Unnamed: 0,CompanyName,Name
0,Eastside Department Store,"Racing Socks, L"
1,Essential Bike Works,"Racing Socks, L"
2,Remarkable Bike Store,"Racing Socks, L"
3,Remarkable Bike Store,"Racing Socks, M"
4,Riding Cycles,"Racing Socks, L"
5,Sports Products Store,"Racing Socks, M"
6,Sports Products Store,"Racing Socks, L"
7,The Bicycle Accessories Company,"Racing Socks, L"
8,The Bicycle Accessories Company,"Racing Socks, M"
9,Thrifty Parts and Sales,"Racing Socks, M"


## 8.
Show the product description for culture 'fr' for product with ProductID 736.

In [5]:
(product.join(prod_model_prod, 'ProductModelID')
 .join(prod_desc, 'ProductDescriptionID')
 .filter((col('ProductID')==736) & (col('Culture').like('%fr%')))
 .select('Description')
 .toPandas())

Unnamed: 0,Description
0,Le cadre LL en aluminium offre une conduite co...


## 9.
Use the SubTotal value in SaleOrderHeader to list orders from the largest to the smallest. For each order show the CompanyName and the SubTotal and the total weight of the order.

In [6]:
# a = (order_head.merge(order_det, on='SalesOrderID')
#      .merge(product, on='ProductID')
#      .merge(cust_aw, on='CustomerID'))
# a['Weight'] = a['Weight'].fillna(0) * a['OrderQty'].fillna(0)
# (a.groupby(['CompanyName', 'SubTotal'])['Weight'].sum()
#  .reset_index().sort_values('SubTotal', ascending=False))

(order_head.join(order_det, on='SalesOrderID')
 .join(product, on='ProductID')
 .join(cust_aw, on='CustomerID')
 .withColumn('Weight', col('Weight') * col('OrderQty'))
 .fillna(0, subset=['Weight'])
 .groupBy('CompanyName', 'SubTotal')
 .sum('Weight')
 .orderBy(col('SubTotal').desc())
 .toPandas()
)

Unnamed: 0,CompanyName,SubTotal,sum(Weight)
0,Action Bicycle Specialists,108561.83,1133911.56
1,Metropolitan Bicycle Supply,98278.69,679588.02
2,Bulk Discount Store,88812.86,34813.05
3,Eastside Department Store,83858.43,565638.72
4,Riding Cycles,78029.69,504095.33
5,Many Bikes Store,74058.81,744328.6
6,Instruments and Parts Company,63980.99,731576.77
7,Extreme Riding Supplies,57634.63,589939.11
8,Trailblazing Sports,41622.05,234328.12
9,Professional Sales and Service,39785.33,396843.63


## 10.
How many products in ProductCategory 'Cranksets' have been sold to an address in 'London'?

In [7]:
(product.join(prod_cat.filter(col('name')=='Cranksets'), 
               on='ProductCategoryID')
 .join(order_det, on='ProductID')
 .join(order_head, on='SalesOrderID')
 .join(cust_aw, on='CustomerID')
 .join(cust_addr, on='CustomerID')
 .join(addr.filter(addr['City']=='London'), on='AddressID')
 .groupBy()
 .count()
 .toPandas())

Unnamed: 0,count
0,2


In [8]:
sc.stop()