# Guest House - Hard

In [1]:
import findspark
import pandas as pd
# Run findspark before any pyspark module is imported so that pyspark is importable.
findspark.init()

# Host used for both the Spark master URL and the HDFS warehouse URL below.
SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

# Session options, applied to the builder in the order listed.
session_options = {
    'spark.sql.warehouse.dir': f'hdfs://{SVR}:9000/user/hive/warehouse',
    'spark.cores.max': '4',
    'spark.executor.instances': '1',
    'spark.executor.cores': '2',
    'spark.executor.memory': '10g',
}

session_builder = SparkSession.builder.appName('app13-3').master(f'spark://{SVR}:7077')
for option_name, option_value in session_options.items():
    session_builder = session_builder.config(option_name, option_value)

# NOTE: `sc` holds a SparkSession (not a SparkContext); the later cells rely on this name.
sc = session_builder.enableHiveSupport().getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Load the five Guest House tables from the `sqlzoo` Hive database,
# one DataFrame per table, bound to a variable of the same name.
booking, guest, room, rate, extra = (
    sc.read.table(f'sqlzoo.{table_name}')
    for table_name in ('booking', 'guest', 'room', 'rate', 'extra'))

## 11.
Coincidence. Have two guests with the same surname ever stayed in the hotel on the same evening? Show the last name and both first names. Do not include duplicates.

```
+-----------+------------+-------------+
| last_name | first_name | first_name  |
+-----------+------------+-------------+
| Davies    | Philip     | David T. C. |
| Evans     | Graham     | Mr Nigel    |
| Howarth   | Mr George  | Sir Gerald  |
| Jones     | Susan Elan | Mr Marcus   |
| Lewis     | Clive      | Dr Julian   |
| McDonnell | John       | Dr Alasdair |
+-----------+------------+-------------+
```

In [3]:
# One row per stay: the guest's name and id, plus the stay's first day and
# its last night (`checkout` here is booking_date + nights - 1).
stay_columns = ['last_name', 'first_name', 'booking_date', 'nights', 'id']
last_night = date_add(col('booking_date'), col('nights') - lit(1))

t = (guest
     .join(booking, on=(guest['id'] == booking['guest_id']))
     .select(*stay_columns)
     .withColumn('checkout', last_night))

# Pair every stay with every stay sharing the same surname. After the join the
# right-hand columns collide with the left-hand ones, so all columns are
# renamed positionally: the right-hand side gets a trailing "1".
left_names = ['last_name', 'first_name', 'booking_date', 'nights', 'id', 'checkout']
right_names = ['first_name1', 'booking_date1', 'nights1', 'id1', 'checkout1']

a = (t.join(t.alias('t1'), 'last_name', how='full')
     .toDF(*left_names, *right_names)
     .where(col('first_name') != col('first_name1')))

# Two stays share an evening when either one starts within the other's
# [booking_date, checkout] range.
starts_during_other = col('booking_date').between(col('booking_date1'), col('checkout1'))
other_starts_during = col('booking_date1').between(col('booking_date'), col('checkout'))
# Keep only one ordering of each pair of guests.
one_ordering_only = col('id') >= col('id1')

(a.where((starts_during_other | other_starts_during) & one_ordering_only)
 .select('last_name', 'first_name1', 'first_name')
 .distinct()
 .orderBy('last_name')
 .toPandas())

                                                                                

Unnamed: 0,last_name,first_name1,first_name
0,Davies,Philip,David T. C.
1,Evans,Graham,Mr Nigel
2,Howarth,Mr George,Sir Gerald
3,Jones,Susan Elan,Mr Marcus
4,Lewis,Clive,Dr Julian
5,McDonnell,John,Dr Alasdair


## 12.
Check out per floor. The first digit of the room number indicates the floor – e.g. room 201 is on the 2nd floor. For each day of the week beginning 2016-11-14 show how many rooms are being vacated that day by floor number. Show all days in the correct order.

```
+------------+-----+-----+-----+
| i          | 1st | 2nd | 3rd |
+------------+-----+-----+-----+
| 2016-11-14 |   5 |   3 |   4 |
| 2016-11-15 |   6 |   4 |   1 |
| 2016-11-16 |   2 |   2 |   4 |
| 2016-11-17 |   5 |   3 |   6 |
| 2016-11-18 |   2 |   3 |   2 |
| 2016-11-19 |   5 |   5 |   1 |
| 2016-11-20 |   2 |   2 |   2 |
+------------+-----+-----+-----+
```

In [4]:
# Count rooms vacated per day and floor for the week starting 2016-11-14.
# The floor is the first character of the room number; it is relabelled
# ('1' -> '1st', ...) so the pivoted column names match the expected table.
floor_labels = {'1': '1st', '2': '2nd', '3': '3rd', '4': '4th'}

(booking
 # Checkout day = arrival day + number of nights.
 .withColumn('checkout', date_add(col('booking_date'), col('nights')))
 .withColumn('floor', col('room_no').substr(1, 1))
 .filter(col('checkout').between('2016-11-14', '2016-11-20'))
 .replace(floor_labels, subset=['floor'])
 .groupBy('checkout')
 .pivot('floor')
 .count()
 # pivot().count() leaves null where a floor has no checkouts on a given day;
 # report those cells as 0 so every cell is a count.
 .fillna(0)
 .orderBy('checkout')
 .toPandas())

Unnamed: 0,checkout,1st,2nd,3rd
0,2016-11-14,5,3,4
1,2016-11-15,6,4,1
2,2016-11-16,2,2,4
3,2016-11-17,5,3,6
4,2016-11-18,2,3,2
5,2016-11-19,5,5,1
6,2016-11-20,2,2,2


## 13.
Free rooms? List the rooms that are free on the day 25th Nov 2016.

```
+-----+
| id  |
+-----+
| 207 |
| 210 |
| 304 |
+-----+
```

In [5]:
# Rooms free on the target day = all rooms minus the rooms that have a
# booking whose [booking_date, last night] range covers that day.
target_day = '2016-11-25'

occupied_on_target_day = (
    booking
    # Last occupied night of the stay (booking_date + nights - 1).
    .withColumn('checkout', date_add(col('booking_date'), col('nights') - 1))
    .filter((col('booking_date') <= target_day) &
            (col('checkout') >= target_day)))

# The anti join keeps only rooms with no matching occupied booking.
(room
 .join(occupied_on_target_day,
       on=(booking['room_no'] == room['id']),
       how='left_anti')
 .select('id')
 .toPandas())

Unnamed: 0,id
0,207
1,210
2,304


## 14.
Single room for three nights required. A customer wants a single room for three consecutive nights. Find the first available date in December 2016.

```
+-----+------------+
| id  | MIN(i)     |
+-----+------------+
| 201 | 2016-12-11 |
+-----+------------+
```

In [6]:
# Find the earliest date in December 2016 on which a single room is free for
# three consecutive nights, by measuring the gap between each booking's
# checkout day and the start of the NEXT booking of the same room.
single_rooms = room.filter(col('room_type') == 'single')

# Bookings of one room in chronological order. The ordering must be by date:
# ordering by the partition key itself would leave the row order arbitrary.
room_bookings_by_date = Window.partitionBy('room_no').orderBy('booking_date')

(booking
 .withColumn('checkout', date_add(col('booking_date'), col('nights')))
 .join(single_rooms, on=(booking['room_no'] == room['id']), how='right')
 # Keep bookings that touch December (or rooms with no booking at all).
 .filter(((col('checkout') >= '2016-12-01') &
          (col('booking_date') <= '2016-12-31')) |
         (col('checkout').isNull()))
 # lead() looks at the following booking; after the last booking of a room
 # the room is treated as free until 2017-01-01.
 .withColumn('next_booking',
             lead('booking_date', 1, '2017-01-01').over(room_bookings_by_date))
 # Number of free nights between this checkout and the next arrival.
 .withColumn('diff', datediff(col('next_booking'), col('checkout')))
 .filter(col('diff') >= 3)
 .select('room_no', 'checkout')
 # Secondary sort key makes the pick deterministic if two rooms tie.
 .orderBy('checkout', 'room_no')
 .limit(1)
 .toPandas())
# NOTE(review): a gap between 2016-12-01 and a room's first December booking,
# and rooms with no booking at all (null room_no/checkout after the right
# join), are not considered by this query — confirm whether that matters.

Unnamed: 0,room_no,checkout
0,201,2016-12-11


## 15.
Gross income by week. Money is collected from guests when they leave. For each Thursday in November and December 2016, show the total amount of money collected from the previous Friday to that day, inclusive.

```
+------------+---------------+
| Thursday   | weekly_income |
+------------+---------------+
| 2016-11-03 |          0.00 |
| 2016-11-10 |      12608.94 |
| 2016-11-17 |      13552.56 |
| 2016-11-24 |      12929.69 |
| 2016-12-01 |      11685.14 |
| 2016-12-08 |      13093.79 |
| 2016-12-15 |       8975.87 |
| 2016-12-22 |       1395.77 |
| 2016-12-29 |          0.00 |
| 2017-01-05 |          0.00 |
+------------+---------------+
```

In [7]:
# Build a one-column DataFrame holding every Thursday between sdate and edate.
import datetime

sdate = datetime.date(2016, 11, 1)
edate = datetime.date(2016, 12, 31)

# date.weekday() numbers Monday as 0, so Thursday is 3.
days_to_thu = (3 - sdate.weekday()) % 7
# Number of whole weeks that fit between the first Thursday and edate.
week_diff = ((edate - sdate).days - days_to_thu) // 7

first_thursday = sdate + datetime.timedelta(days=days_to_thu)
one_week = datetime.timedelta(days=7)
thursday_rows = [(first_thursday + week_index * one_week, )
                 for week_index in range(week_diff + 1)]

thur = sc.createDataFrame(thursday_rows, ['Thursday'])
thur.show()

+----------+
|  Thursday|
+----------+
|2016-11-03|
|2016-11-10|
|2016-11-17|
|2016-11-24|
|2016-12-01|
|2016-12-08|
|2016-12-15|
|2016-12-22|
|2016-12-29|
+----------+



In [8]:
# Weekly income: room charges plus extras, attributed to the checkout day and
# rolled up to the Thursday that closes each Friday-to-Thursday week.
income_columns = ['booking_date', 'nights', 'income']

# Room charge per booking = nightly rate for the requested room type and
# occupancy, multiplied by the number of nights.
room_charges = (
    booking
    .join(rate,
          on=(booking['occupants'] == rate['occupancy']) &
             (booking['room_type_requested'] == rate['room_type']),
          how='left')
    .withColumn('income', col('amount') * col('nights'))
    .select(*income_columns))

# One row per extra charged to a booking (bookings without extras yield a
# null income, which the sum below ignores).
extra_charges = (
    booking
    .join(extra, on='booking_id', how='left')
    .withColumn('income', col('amount'))
    .select(*income_columns))

# dayofweek() numbers Sunday as 1, so Thursday is 5. Sunday..Thursday roll
# forward to the Thursday of the same week; Friday and Saturday roll forward
# to the Thursday of the following week.
on_or_before_thursday = col('wkd') <= 5
same_week_thursday = date_add(col('checkout'), lit(5) - col('wkd'))
next_week_thursday = date_add(col('checkout'), lit(12) - col('wkd'))

(room_charges
 .union(extra_charges)
 .withColumn('checkout', date_add(col('booking_date'), col('nights')))
 .withColumn('wkd', dayofweek(col('checkout')))
 .withColumn('Thursday',
             when(on_or_before_thursday, same_week_thursday)
             .otherwise(next_week_thursday))
 # Right join keeps every generated Thursday, even those with no income.
 .join(thur, on='Thursday', how='right')
 .groupBy('Thursday')
 .agg(sum('income').alias('weekly_income'))
 .fillna(0)
 .orderBy('Thursday')
 .toPandas())

Unnamed: 0,Thursday,weekly_income
0,2016-11-03,0.0
1,2016-11-10,12608.94
2,2016-11-17,13552.56
3,2016-11-24,12929.69
4,2016-12-01,11685.14
5,2016-12-08,13093.79
6,2016-12-15,8975.87
7,2016-12-22,1395.77
8,2016-12-29,0.0


In [9]:
# Shut down the Spark session created in the first cell.
sc.stop()