# AdventureWorks: resit questions

In [1]:
import findspark
import pandas as pd
# Must run before the pyspark imports that follow in this cell: findspark
# is what makes the pyspark package importable in this kernel.
findspark.init()

# Host that serves both the Spark master (port 7077) and the HDFS namenode
# (port 9000) used when the session is built later in this cell.
SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

# Session options, applied in the order listed. The warehouse directory points
# at the Hive warehouse on HDFS so that `sc.read.table(...)` can resolve the
# sqlzoo tables.
# NOTE(review): confirm `spark.executor.instances` is honoured by this master;
# with cores.max=4 and executor.cores=2 more than one executor may be started.
spark_conf = {
    'spark.sql.warehouse.dir': f'hdfs://{SVR}:9000/user/hive/warehouse',
    'spark.cores.max': '4',
    'spark.executor.instances': '1',
    'spark.executor.cores': '2',
    'spark.executor.memory': '10g',
}

# Connect to the Spark master on SVR and register the application name.
builder = SparkSession.builder.appName('app14-4').master(f'spark://{SVR}:7077')
for option, setting in spark_conf.items():
    builder = builder.config(option, setting)

# `sc` is the SparkSession (with Hive support) that every later cell uses.
sc = builder.enableHiveSupport().getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Load the AdventureWorks tables from the `sqlzoo` database as Spark
# DataFrames. Reads are lazy, so nothing is scanned until a query runs.

# Customers and where they are located
cust_aw = sc.table('sqlzoo.CustomerAW')
cust_addr = sc.table('sqlzoo.CustomerAddress')
addr = sc.table('sqlzoo.Address')

# Orders: one header row per order, one detail row per order line
order_head = sc.table('sqlzoo.SalesOrderHeader')
order_det = sc.table('sqlzoo.SalesOrderDetail')

# Products and their descriptive metadata
product = sc.table('sqlzoo.Product')
prod_cat = sc.table('sqlzoo.ProductCategory')
prod_model = sc.table('sqlzoo.ProductModel')
prod_model_prod = sc.table('sqlzoo.ProductModelProductDescription')
prod_desc = sc.table('sqlzoo.ProductDescription')

## 1.
**List the SalesOrderNumber for the customers 'Good Toys' and 'Bike World'**

In [3]:
# The two customers the question asks about.
wanted_customers = cust_aw.where(
    col('CompanyName').isin('Good Toys', 'Bike World'))

# Right join so a customer with no orders still appears (with a null order
# number). The column really is spelled 'SalessOrderNumber' in the loaded
# table, so the spelling must be kept as-is.
(order_head
 .join(wanted_customers, on='CustomerID', how='right')
 .select('SalessOrderNumber', 'CompanyName')
 .toPandas())

                                                                                

Unnamed: 0,SalessOrderNumber,CompanyName
0,,Bike World
1,SO71774,Good Toys


## 2.
**List the ProductName and the quantity of what was ordered by 'Futuristic Bikes'**

In [4]:
# Restrict to the single customer of interest first; the final right join
# then keeps only rows belonging to that customer.
futuristic_bikes = cust_aw.where(col('CompanyName') == 'Futuristic Bikes')

# product -> order line -> order header -> customer, each step keeping every
# row of the table on the right-hand side of the join.
(product
 .join(order_det, on='ProductID', how='right')
 .join(order_head, on='SalesOrderID', how='right')
 .join(futuristic_bikes, on='CustomerID', how='right')
 .select('Name', 'OrderQty')
 .toPandas())

Unnamed: 0,Name,OrderQty
0,ML Mountain Seat/Saddle,2
1,"Long-Sleeve Logo Jersey, L",2
2,"Classic Vest, S",3


## 3.
**List the names and addresses of companies containing the word 'Bike' (upper or lower case) and companies containing 'cycle' (upper or lower case). Ensure that the 'bike's are listed before the 'cycle's.**

In [5]:
# Case-insensitive keyword tag: 'bike' takes precedence over 'cycle', and a
# company matching neither gets a null tag.
name_lc = lower(col('CompanyName'))
keyword_tag = (when(name_lc.like('%bike%'), 'bike')
               .when(name_lc.like('%cycle%'), 'cycle'))

address_cols = ['AddressLine1', 'AddressLine2', 'City',
                'StateProvince', 'CountryRegion', 'PostalCode']

# Sorting on the tag puts 'bike' rows ahead of 'cycle' rows because 'bike'
# sorts first alphabetically.
(cust_aw
 .join(cust_addr, on='CustomerID')
 .join(addr, on='AddressID')
 .select('CompanyName', *address_cols)
 .withColumn('flag', keyword_tag)
 .where(col('flag').isNotNull())
 .sort('flag', 'CompanyName')
 .toPandas())

Unnamed: 0,CompanyName,AddressLine1,AddressLine2,City,StateProvince,CountryRegion,PostalCode,flag
0,A Bike Store,2251 Elliot Avenue,,Seattle,Washington,United States,98104,bike
1,A Typical Bike Shop,"One Dancing, Rr",No. 25 Box 8033,Round Rock,Texas,United States,78664,bike
2,Advanced Bike Components,12345 Sterling Avenue,,Irving,Texas,United States,75061,bike
3,Area Bike Accessories,6900 Sisk Road,,Modesto,California,United States,95354,bike
4,Associated Bikes,5420 West 22500 South,,Salt Lake City,Utah,United States,84101,bike
...,...,...,...,...,...,...,...,...
187,"Unicycles, Bicycles, and Tricycles",Stonewood Mall,,Downey,California,United States,90241,cycle
188,Valley Bicycle Distributors,5867 Sunrise Boulevard,,Citrus Heights,California,United States,95610,cycle
189,Valley Bicycle Specialists,Blue Ridge Mall,,Kansas City,Missouri,United States,64106,cycle
190,Westside Cycle Store,25550 Executive Dr,,Elgin,Illinois,United States,60120,cycle


## 4.
**Show the total order value for each CountryRegion. List by value with the highest first.**

In [6]:
# Attach each order to the address it ships to, so the order is attributed to
# that address's CountryRegion.
orders_with_ship_addr = addr.join(
    order_head, on=(addr['AddressID'] == order_head['ShipToAddressID']))

# Total the order SubTotal per country and list the largest total first.
(orders_with_ship_addr
 .groupBy('CountryRegion')
 .agg({'SubTotal': 'sum'})  # result column is named 'sum(SubTotal)'
 .sort(desc('sum(SubTotal)'))
 .toPandas())

Unnamed: 0,CountryRegion,sum(SubTotal)
0,United Kingdom,518096.42
1,United States,347336.69


## 5.
**Find the best customer (the one with the highest total order value) in each region.**

In [7]:
# Rank customers within each country by their total order value, highest
# first. rank() gives tied customers the same position, so ties for first
# place are all returned.
by_region_value = (Window.partitionBy('CountryRegion')
                   .orderBy(desc('sum(SubTotal)')))

# Orders are attributed to the country of their ship-to address.
(addr
 .join(order_head, on=(addr['AddressID'] == order_head['ShipToAddressID']))
 .join(cust_aw, on='CustomerID')
 .groupBy('CountryRegion', 'CompanyName')
 .agg({'SubTotal': 'sum'})  # result column is named 'sum(SubTotal)'
 .withColumn('sn', rank().over(by_region_value))
 .where(col('sn') == 1)
 .drop('sn')  # leaves CountryRegion, CompanyName, sum(SubTotal)
 .toPandas())

Unnamed: 0,CountryRegion,CompanyName,sum(SubTotal)
0,United Kingdom,Action Bicycle Specialists,108561.83
1,United States,Eastside Department Store,83858.43


In [8]:
# Stop the Spark session created in the first cell now that all queries have run.
sc.stop()