# Dressmaker - Hard
You may need to create views to complete these questions - but you do not have permission to create tables or views in the default schema. Your SQL commands are executed by user scott in schema gisq - you may create or drop views and tables in schema scott but not in gisq.

In [1]:
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

sc = (SparkSession.builder.appName('app17-3') 
      .master(f'spark://{SVR}:7077') 
      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') 
      .config('spark.cores.max', '4') 
      .config('spark.executor.instances', '1') 
      .config('spark.executor.cores', '2') 
      .config('spark.executor.memory', '10g') 
      .enableHiveSupport().getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
jmcust = sc.read.table('sqlzoo.jmcust')
dressmaker = sc.read.table('sqlzoo.dressmaker')
dress_order = sc.read.table('sqlzoo.dress_order')
construction = sc.read.table('sqlzoo.construction')
quantities = sc.read.table('sqlzoo.quantities')
order_line = sc.read.table('sqlzoo.order_line')
garment = sc.read.table('sqlzoo.garment')
material = sc.read.table('sqlzoo.material')

## 1.
When creating a view in scott you must specify the schema name of the sources and the destination.

## 2.
It is decided to review the materials stock. How much did each material contribute to turnover in 2002?

In [3]:
(material
 .join(order_line, on=(material['material_no']==order_line['ol_material']))
 .join(quantities, on=((order_line['ol_size']==quantities['size_q']) &
                       (order_line['ol_style']==quantities['style_q'])))
 .join(dress_order.filter(year(dress_order['order_date'])==2002),
       on=(order_line['order_ref']==dress_order['order_no']))
 .withColumn('cost', col('cost') * col('quantity'))
 .groupBy('material_no', 'fabric', 'colour', 'pattern')
 .agg(sum('quantity'), sum('cost'))
 .toPandas())

                                                                                

Unnamed: 0,material_no,fabric,colour,pattern,sum(quantity),sum(cost)
0,7,Polyester,Pale Yellow,Printed,4.3,10.965
1,5,Cotton,Black Dotted,Woven,6.4,19.2
2,1,Silk,Black,Plain,4.9,34.3
3,6,Cotton,Red Stripe,Woven,2.2,6.6
4,8,Cotton,Blue Stripe,Woven,4.2,12.6
5,4,Cotton,Green Stripe,Woven,2.2,6.6
6,3,Cotton,Yellow Stripe,Woven,5.7,17.1
7,2,Silk,Red Abstract,Printed,9.3,93.0
8,10,Silk,Green Abstract,Printed,8.3,124.5
9,14,Cotton,Green Abstract,Printed,4.5,15.75


## 3.
An order for shorts has just been placed and the work is to be distributed amongst the workforce, and we wish to know how busy the shorts makers are. For each of the workers who have experience of making shorts show the number of hours work that she is currently committed to, assuming a meagre wage of £4.50 per hour

In [4]:
(dressmaker
 .join(dressmaker
        .join(construction, on=(dressmaker['d_no']==construction['maker']))
        .join(dress_order, on=(construction['order_ref']==dress_order['order_no']))
        .join(order_line, on=(dress_order['order_no']==order_line['order_ref']))
        .join(garment.filter(lower(trim(garment['description']))=='shorts'), 
              on=(order_line['ol_style']==garment['style_no']))
        .select('d_no')
        .distinct().alias('shorts'),
       on='d_no')
 .join(construction, on=(dressmaker['d_no']==construction['maker']))
 .join(dress_order.filter(dress_order['completed']=='N'), 
       on=(col('order_ref')==dress_order['order_no']))
 .join(order_line, on=(col('order_no')==order_line['order_ref']))
 .join(garment.withColumn('hrs', garment['labour_cost']/4.5),  
       on=(col('ol_style')==garment['style_no']))
 .join(quantities, on=((col('ol_size')==quantities['size_q']) &
                       (col('ol_style')==quantities['style_q'])))
 .groupBy('d_name')
 .sum('hrs')
 .toPandas())

Unnamed: 0,d_name,sum(hrs)
0,Mr Seam,28.166667
1,Ms Sew,18.611111
2,Miss Stitch,49.166667
3,Miss Pins,28.166667
4,Mr Needles,18.611111
5,Mr Taylor,18.611111


## 4.
"Big spender of the year" is the customer who spends the most on high value items. Identify the "Big spender of the year 2002" if the "high value" threshold is set at £30. Also who would it be if the threshold was £20 or £50?

In [5]:
from functools import reduce

t = (order_line
     .join(quantities, on=((order_line['ol_style']==quantities['style_q']) & 
                           (order_line['ol_size']==quantities['size_q'])))
     .join(garment, on=(order_line['ol_style']==garment['style_no']))
     .join(material, on=(order_line['ol_material']==material['material_no']))
     .withColumn('tot_cost', col('labour_cost') + col('quantity') * col('cost')))

c = (dress_order.filter(year(dress_order['order_date'])==2002)
     .join(jmcust, on=(dress_order['cust_no']==jmcust['c_no']))
     .join(t.select('order_ref', 'line_no', 'tot_cost'), 
           on=(col('order_no')==t['order_ref'])))

def find_big_spender(thres: float):
    ret = (c.filter(c['tot_cost']>=thres)
           .groupBy('c_name')
           .agg(sum('tot_cost').alias('tot_cost'))
           .orderBy(col('tot_cost').desc())
           .limit(1))
    return ret.select('c_name', 'tot_cost').withColumn('thres', lit(thres))

reduce(lambda x, y: x.union(y), 
       map(find_big_spender, [20, 30, 50])).toPandas()

Unnamed: 0,c_name,tot_cost,thres
0,Mr Brass,198.54,20
1,Ms White,173.55,30
2,Mr Brass,72.0,50


## 5.
Who is the fastest at making trousers?

In [6]:
(dressmaker.join(construction, on=(dressmaker['d_no']==construction['maker']))
 .join(order_line, on=((construction['order_ref']==order_line['order_ref']) & 
                       (construction['line_ref']==order_line['line_no'])))
 .join(garment.filter(lower(trim(garment['description']))=='trousers'), 
       on=(order_line['ol_style']==garment['style_no']))
 .withColumn('days', to_date(col('finish_date'))-to_date(col('start_date')))
 .select('d_no', 'd_name', 'days')
 .dropna(subset='days')
 .orderBy('days')
 .limit(1)
 .toPandas())

Unnamed: 0,d_no,d_name,days
0,3,Mr Needles,3 days


## 6.
"Employee of the month" is the seamstress who completes the greatest value of clothes. Show the "employees of the month" for months in 2002.

In [7]:
(material.join(order_line, on=(material['material_no']==order_line['ol_material']))
 .join(quantities, on=((order_line['ol_style']==quantities['style_q']) & 
                       (order_line['ol_size']==quantities['size_q'])))
 .join(garment, on=(order_line['ol_style']==garment['style_no']))
 .join(construction.filter(year(to_date(construction['start_date']))==2002), 
       on=((construction['order_ref']==order_line['order_ref']) & 
           (construction['line_ref']==order_line['line_no'])))
 .join(dressmaker, on=(construction['maker']==dressmaker['d_no']))
 .withColumn('month', month(to_date(col('start_date'))))
 .withColumn('val', col('quantity') * col('cost') + col('labour_cost'))
 .groupBy('d_name', 'month')
 .agg(sum('val').alias('val'))
 .withColumn('sn', rank().over(
     Window.partitionBy('month').orderBy(col('val').desc())))
 .filter(col('sn')==1)
 .select('d_name', 'month', 'val')
 .toPandas())

Unnamed: 0,d_name,month,val
0,Miss Stitch,1,49.0
1,Mrs Hem,2,122.25
2,Miss Stitch,3,97.2


In [8]:
sc.stop()