# AdventureWorks - Hard

In [1]:
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

sc = (SparkSession.builder.appName('app14-3') 
      .master(f'spark://{SVR}:7077') 
      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') 
      .config('spark.cores.max', '4') 
      .config('spark.executor.instances', '1') 
      .config('spark.executor.cores', '2') 
      .config('spark.executor.memory', '10g') 
      .enableHiveSupport().getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
cust_aw = sc.read.table('sqlzoo.CustomerAW')
cust_addr = sc.read.table('sqlzoo.CustomerAddress')
addr = sc.read.table('sqlzoo.Address')
product = sc.read.table('sqlzoo.Product')
order_det = sc.read.table('sqlzoo.SalesOrderDetail')
order_head = sc.read.table('sqlzoo.SalesOrderHeader')
prod_model = sc.read.table('sqlzoo.ProductModel')
prod_model_prod = sc.read.table('sqlzoo.ProductModelProductDescription')
prod_desc = sc.read.table('sqlzoo.ProductDescription')
prod_cat = sc.read.table('sqlzoo.ProductCategory')

## 11.
**For every customer with a 'Main Office' in Dallas show AddressLine1 of the 'Main Office' and AddressLine1 of the 'Shipping' address - if there is no shipping address leave it blank. Use one row per customer.**

In [3]:
(cust_aw.join(cust_addr, 'CustomerID')
 .join(cust_aw
       .join(cust_addr.filter(cust_addr['AddressType']=='Main Office'),
             on='CustomerID')
       .join(addr.filter(addr['City']=='Dallas'), on='AddressID') 
       .select('CustomerID')
       .distinct(), on='CustomerID')
 .join(addr, on='AddressID')
 .select('CompanyName', 'AddressType', 'AddressLine1')
 .distinct()
 .groupBy('CompanyName')
 .pivot('AddressType', ['Main Office', 'Shipping'])
 .agg(first('AddressLine1'))
 .fillna('')
 .toPandas())

                                                                                

Unnamed: 0,CompanyName,Main Office,Shipping
0,Elite Bikes,Po Box 8259024,9178 Jumping St.
1,Rental Bikes,"99828 Routh Street, Suite 825",
2,Third Bike Store,2500 North Stemmons Freeway,
3,Town Industries,P.O. Box 6256916,
4,Unsurpassed Bikes,Po Box 8035996,


## 12.
**For each order show the SalesOrderID and SubTotal calculated three ways:**

- **A) From the SalesOrderHeader**
- **B) Sum of OrderQty*UnitPrice**
- **C) Sum of OrderQty*ListPrice**

In [4]:
(order_head.select('SalesOrderID', 'SubTotal')
 .join(order_det
       .withColumn('SubTotal', col('OrderQty') * col('UnitPrice') * 
                   (lit(1)-col('UnitPriceDiscount')))
       .groupBy(col('SalesOrderID'))
       .sum('SubTotal'),
      on='SalesOrderID')
 .join(order_det
       .join(product, on='ProductID')
       .withColumn('SubTotal', col('OrderQty') * col('ListPrice'))
       .groupBy('SalesOrderID')
       .sum('SubTotal'),
      on='SalesOrderID')
 .orderBy('SalesOrderID')
 .toPandas())


Unnamed: 0,SalesOrderID,SubTotal,sum(SubTotal),sum(SubTotal).1
0,71774,880.35,713.8,1189.66
1,71776,78.81,63.9,106.5
2,71780,38418.69,29922.81,56651.56
3,71782,39785.33,33319.68,55533.31
4,71783,83858.43,65682.7396,121625.43
5,71784,108561.83,89868.8795,151932.58
6,71796,57634.63,47848.02,79746.71
7,71797,78029.69,65122.7911,108986.4
8,71815,1141.58,926.91,1544.86
9,71816,3398.17,2847.37,4745.68


## 13.
**Show the best selling item by value.**

In [5]:
(order_det.join(product, on='ProductID')
 .withColumn('SubTotal', col('OrderQty') * col('UnitPrice'))
 .groupBy('ProductID', 'Name')
 .sum('SubTotal')
 .orderBy(col('Sum(SubTotal)').desc())
 .toPandas())

Unnamed: 0,ProductID,Name,sum(SubTotal)
0,969,"Touring-1000 Blue, 60",37191.44
1,783,"Mountain-200 Black, 42",37178.73
2,782,"Mountain-200 Black, 38",35801.74
3,976,"Road-350-W Yellow, 48",33509.58
4,957,"Touring-1000 Yellow, 60",23745.32
...,...,...,...
132,907,Rear Brakes,63.90
133,913,HL Road Seat/Saddle,63.16
134,874,"Racing Socks, M",59.29
135,947,HL Touring Handlebars,54.94


## 14.
**Show how many orders are in the following ranges (in $):**

```
    RANGE      Num Orders      Total Value
    0-  99
  100- 999
 1000-9999
10000-
```

In [6]:
(order_head
 .withColumn('RANGE', 
             when(col('SubTotal').between(0, 99.99), '    0-  99')
             .when(col('SubTotal').between(100, 999.99), '  100- 999')
             .when(col('SubTotal').between(1000, 9999.99), ' 1000-9999')
             .otherwise('10000-    '))
 .groupBy('RANGE')
 .agg(count('SubTotal'), sum('SubTotal'))
 .orderBy('RANGE')
 .toPandas())

Unnamed: 0,RANGE,count(SubTotal),sum(SubTotal)
0,0- 99,3,158.66
1,100- 999,5,2386.21
2,1000-9999,10,27561.43
3,10000-,14,835326.81


## 15.
**Identify the three most important cities. Show the break down of top level product category against city.**

In [7]:
(addr
 .join(addr
       .join(order_head, on=(addr['AddressID']==order_head['ShipToAddressID']))
       .groupBy('City')
       .sum('SubTotal')
       .orderBy(col('sum(SubTotal)').desc())
       .limit(3),
      on='City')
 .join(order_head, on=(addr['AddressID']==order_head['ShipToAddressID']))
 .join(order_det, on='SalesOrderID')
 .join(product, on='ProductID')
 .join(prod_cat.withColumnRenamed('name', 'catg_name'), on='ProductCategoryID')
 .withColumn('amount', col('OrderQty') * col('UnitPrice'))
 .groupBy('City', 'catg_name')
 .sum('amount')
 .orderBy('City', 'catg_name')
 .toPandas())

Unnamed: 0,City,catg_name,sum(amount)
0,London,Bottom Brackets,388.73
1,London,Brakes,255.6
2,London,Chains,36.42
3,London,Cranksets,1773.81
4,London,Derailleurs,638.85
5,London,Gloves,88.14
6,London,Handlebars,292.63
7,London,Helmets,20.99
8,London,Mountain Bikes,50881.99
9,London,Mountain Frames,24018.8


In [8]:
sc.stop()