# Dressmaker - Easy

In [1]:
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

sc = (SparkSession.builder.appName('app17-1') 
      .master(f'spark://{SVR}:7077') 
      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') 
      .config('spark.cores.max', '4') 
      .config('spark.executor.instances', '1') 
      .config('spark.executor.cores', '2') 
      .config('spark.executor.memory', '10g') 
      .enableHiveSupport().getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
jmcust = sc.read.table('sqlzoo.jmcust')
dressmaker = sc.read.table('sqlzoo.dressmaker')
dress_order = sc.read.table('sqlzoo.dress_order')
construction = sc.read.table('sqlzoo.construction')
quantities = sc.read.table('sqlzoo.quantities')
order_line = sc.read.table('sqlzoo.order_line')
garment = sc.read.table('sqlzoo.garment')
material = sc.read.table('sqlzoo.material')

## 1.
List the post code, order number, order date and garment descriptions for all items associated with Ms Brown.

In [3]:
t1 = (jmcust.filter(trim(jmcust['c_name'])=='Ms Brown')
      .join(dress_order, on=(jmcust['c_no']==dress_order['cust_no']))
      .join(order_line, on=(dress_order['order_no']==order_line['order_ref']))
      .join(garment, on=(order_line['ol_style']==garment['style_no']))
      .withColumnRenamed('c_post_code', 'post_code')
      .select('post_code', 'order_no', 'order_date', 'description'))
t2 = (dressmaker.filter(trim(dressmaker['d_name'])=='Ms Brown')
      .join(construction, on=(dressmaker['d_no']==construction['maker']))
      .join(dress_order, on=(construction['order_ref']==dress_order['order_no']))
      .join(order_line, on=(dress_order['order_no']==order_line['order_ref']))
      .join(garment, on=(order_line['ol_style']==garment['style_no']))
      .withColumnRenamed('d_post_code', 'post_code')
      .select('post_code', 'order_no', 'order_date', 'description'))
t1.union(t2).toPandas()

                                                                                

Unnamed: 0,post_code,order_no,order_date,description
0,B2 5AB,9,2002-02-27,Shorts
1,B2 5AB,7,2002-02-21,Suntop
2,B2 5AB,7,2002-02-21,Sundress
3,B2 5AB,7,2002-02-21,Short Skirt


## 2.
List the customer name, postal information, order date and order number of all orders that have been completed.

In [4]:
(jmcust.join(dress_order.filter(dress_order['completed']=='Y'),
             on=(jmcust['c_no']==dress_order['cust_no']))
 .join(order_line, on=(dress_order['order_no']==order_line['order_ref']))
 .select('c_name', 'c_post_code', 'order_date', 'order_no')
 .toPandas())

Unnamed: 0,c_name,c_post_code,order_date,order_no
0,Ms White,E24 8PQ,2002-02-03,5
1,Ms White,E24 8PQ,2002-02-03,5
2,Ms White,E24 8PQ,2002-02-03,5
3,Mr Brass,FG24 9NM,2002-02-02,4
4,Mr Brass,FG24 9NM,2002-02-02,4
5,Ms Muir,H2 7CV,2002-01-20,3
6,Dr Green,SJ4 4WE,2002-01-11,2
7,Dr Green,SJ4 4WE,2002-01-11,2
8,Dr Green,SJ4 4WE,2002-01-11,2
9,Mrs Peacock,DD6 9NM,2002-01-10,1


## 3.
Which garments have been made or are being made from 'red abstract' or 'blue abstract' coloured materials.

In [5]:
(garment.join(order_line, on=(garment['style_no']==order_line['ol_style']))
 .join(material.filter(lower(trim(material['colour'])).isin(
     'red abstract', 'blue abstract')),
     on=(order_line['ol_material']==material['material_no']))
 .select('style_no', 'description', 'colour')
 .toPandas())

Unnamed: 0,style_no,description,colour
0,1,Trousers,Red Abstract
1,2,Long Skirt,Blue Abstract
2,2,Long Skirt,Red Abstract
3,4,Short Skirt,Blue Abstract
4,5,Sundress,Red Abstract


## 4.
How many garments has each dressmaker constructed? You should give the number of garments and the name and postal information of each dressmaker.

In [6]:
(dressmaker.join(construction, on=(dressmaker['d_no']==construction['maker']))
 .join(dress_order, on=(construction['order_ref']==dress_order['order_no']))
 .join(order_line, on=(dress_order['order_no']==order_line['order_ref']))
 .join(garment, on=(order_line['ol_style']==garment['style_no']))
 .groupBy('d_no', 'd_name', 'd_post_code')
 .agg(count('line_no').alias('cnt'))
 .toPandas())

Unnamed: 0,d_no,d_name,d_post_code,cnt
0,5,Mr Seam,H45 7YH,12
1,6,Mr Taylor,SH6 9RT,7
2,7,Miss Pins,B4 9BL,11
3,3,Mr Needles,E12 6LG,12
4,1,Mrs Hem,A12 6BC,14
5,4,Ms Sew,EF7 9KL,12
6,2,Miss Stitch,DF4 7HJ,20


## 5.
Give the names of those dressmakers who have finished items made from silk for completed orders.

In [7]:
(dressmaker.join(construction, on=(dressmaker['d_no']==construction['maker']))
 .join(dress_order.filter(dress_order['completed']=='Y'), 
       on=(construction['order_ref']==dress_order['order_no']))
 .join(order_line, on=(dress_order['order_no']==order_line['order_ref']))
 .join(garment, on=(order_line['ol_style']==garment['style_no']))
 .join(material.filter(lower(trim(material['fabric']))=='silk'), 
        on=(order_line['ol_material']==material['material_no']))
 .select('d_name')
 .distinct()
 .toPandas())

Unnamed: 0,d_name
0,Miss Stitch
1,Mrs Hem
2,Mr Needles
3,Ms Sew
4,Miss Pins


In [8]:
sc.stop()