# Dressmaker - Medium

In [1]:
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window
å
sc = (SparkSession.builder.appName('app17-2') 
      .master(f'spark://{SVR}:7077') 
      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') 
      .config('spark.cores.max', '4') 
      .config('spark.executor.instances', '1') 
      .config('spark.executor.cores', '2') 
      .config('spark.executor.memory', '10g') 
      .enableHiveSupport().getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
jmcust = sc.read.table('sqlzoo.jmcust')
dressmaker = sc.read.table('sqlzoo.dressmaker')
dress_order = sc.read.table('sqlzoo.dress_order')
construction = sc.read.table('sqlzoo.construction')
quantities = sc.read.table('sqlzoo.quantities')
order_line = sc.read.table('sqlzoo.order_line')
garment = sc.read.table('sqlzoo.garment')
material = sc.read.table('sqlzoo.material')

## 1.
Assuming that any garment could be made in any of the available materials, list the garments (description, fabric, colour and pattern) which are expensive to make, that is, those for which the labour costs are 80% or more of the total cost.

In [3]:
(garment.join(material)
 .filter(col('labour_cost')/(col('labour_cost')+col('cost'))>0.8) 
 .select('description', 'fabric', 'colour', 'pattern')
 .toPandas())

                                                                                

Unnamed: 0,description,fabric,colour,pattern
0,Trousers,Cotton,Yellow Stripe,Woven
1,Long Skirt,Cotton,Yellow Stripe,Woven
2,Short Skirt,Cotton,Yellow Stripe,Woven
3,Sundress,Cotton,Yellow Stripe,Woven
4,Trousers,Cotton,Green Stripe,Woven
5,Long Skirt,Cotton,Green Stripe,Woven
6,Short Skirt,Cotton,Green Stripe,Woven
7,Sundress,Cotton,Green Stripe,Woven
8,Trousers,Cotton,Black Dotted,Woven
9,Long Skirt,Cotton,Black Dotted,Woven


## 2.
List the descriptions and the number of orders of the less popular garments, that is those for which less than the average number of orders per garment have been placed. Also print out the average number of orders per garment. When calculating the average, ignore any garments for which no orders have been made.

In [4]:
(garment.join(order_line, on=(garment['style_no']==order_line['ol_style']))
 .groupBy('style_no', 'description')
 .agg(count('order_ref').alias('n_orders'))
 .withColumn('avg_orders', avg(col('n_orders')).over(Window.partitionBy(lit(0))))
 .filter(col('n_orders') < col('avg_orders'))
 .select('description', 'n_orders', 'avg_orders')
 .toPandas()
)

Unnamed: 0,description,n_orders,avg_orders
0,Sundress,5,5.166667
1,Short Skirt,5,5.166667
2,Suntop,4,5.166667
3,Shorts,5,5.166667


## 3.
Which is the most popular line, that is, the garment with the highest number of orders. Bearing in mind the fact that there may be several such garments, list the garment description(s) and number(s) of orders.

In [5]:
(garment.join(order_line, on=(garment['style_no']==order_line['ol_style']))
 .groupBy('style_no', 'description')
 .agg(count('order_ref').alias('n_orders'))
 .withColumn('max_orders', max('n_orders').over(Window.partitionBy(lit(0))))
 .filter(col('n_orders')==col('max_orders'))
 .select('description', 'n_orders')
 .toPandas())

Unnamed: 0,description,n_orders
0,Trousers,6
1,Long Skirt,6


## 4.
List the descriptions, and costs of the more expensive size 8, Cotton garments which might be ordered, that is those costing more than the average (labour costs + material costs) to make.

In [6]:
(garment.join(order_line, on=(garment['style_no']==order_line['ol_style']))
 .join(material, on=(order_line['ol_material']==material['material_no']))
 .join(quantities, on=((order_line['ol_size']==quantities['size_q']) &
                       (order_line['ol_style']==quantities['style_q'])))
 .withColumn('tot_cost', col('labour_cost') + col('quantity') * col('cost'))
 .withColumn('avg_cost', avg(col('tot_cost')).over(Window.partitionBy(lit(0))))
 .filter((col('ol_size') == 8) & (lower(trim(col('fabric'))) == 'cotton') &
         (col('tot_cost') > col('avg_cost')))
 .select('description', 'material_no', 'tot_cost')
 .toPandas())

Unnamed: 0,description,material_no,tot_cost
0,Sundress,14,31.2


## 5.
What is the most common size ordered for each garment type? List description, size and number of orders, assuming that there could be several equally popular sizes for each type.

In [7]:
(garment.join(order_line, on=(garment['style_no']==order_line['ol_style']))
 .groupBy('style_no', 'description', 'ol_size')
 .agg(count('order_ref').alias('n_orders'))
 .groupBy('style_no', 'description', 'ol_size')
 .max('n_orders')
 .orderBy('style_no', 'description', 'ol_size')
 .toPandas())

Unnamed: 0,style_no,description,ol_size,max(n_orders)
0,1,Trousers,8,3
1,1,Trousers,14,1
2,1,Trousers,16,1
3,1,Trousers,18,1
4,2,Long Skirt,8,3
5,2,Long Skirt,14,1
6,2,Long Skirt,16,1
7,2,Long Skirt,18,1
8,3,Shorts,8,1
9,3,Shorts,10,1


In [8]:
sc.stop()