# Neeps - Hard

In [1]:
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

sc = (SparkSession.builder.appName('app15-3') 
      .master(f'spark://{SVR}:7077') 
      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') 
      .config('spark.cores.max', '4') 
      .config('spark.executor.instances', '1') 
      .config('spark.executor.cores', '2') 
      .config('spark.executor.memory', '10g') 
      .enableHiveSupport().getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
import datetime

In [3]:
ut_staff = sc.read.table('sqlzoo.ut_staff')
ut_student = sc.read.table('sqlzoo.ut_student')
ut_event = sc.read.table('sqlzoo.ut_event')
ut_room = sc.read.table('sqlzoo.ut_room')
ut_attends = sc.read.table('sqlzoo.ut_attends')
ut_teaches = sc.read.table('sqlzoo.ut_teaches')
ut_occurs = sc.read.table('sqlzoo.ut_occurs')
ut_modle = sc.read.table('sqlzoo.ut_modle')
ut_week = sc.read.table('sqlzoo.ut_week')

## 11.
**co.CHt is to be given all the teaching that co.ACg currently does. Identify those events which will clash.**

In [4]:
t = (ut_event.withColumnRenamed('id', 'event')
     .join(ut_teaches, on='event')
     .join(ut_occurs, on='event')
     .withColumn('begin', to_timestamp(col('tod'), format='HH:mm'))
     .withColumn('end', (unix_timestamp(col('tod'), format='HH:mm') + 
                 (3600 * col('duration'))).cast('timestamp')))

(t.filter(col('staff')=='co.CHt').alias('t1')
 .join(t.filter(col('staff')=='co.ACg').alias('t2'), 
       on=['week', 'dow'])
 .filter(((col('t1.begin') >= col('t2.begin')) & 
          (col('t1.begin') < col('t2.end'))) |
         ((col('t2.begin') >= col('t1.begin')) & 
          (col('t2.begin') < col('t1.end'))))
 .select(col('t1.event').alias('co.Cht'), 
         col('t2.event').alias('co.ACg'))
 .distinct()
 .toPandas())

                                                                                

Unnamed: 0,co.Cht,co.ACg
0,co12005.T03,co12005.T01
1,co12005.T03,co72013.L02
2,co12005.T03,co72013.T03


## 12.
**Produce a table showing the utilisation rate and the occupancy level for all rooms with a capacity more than 60.**

> I don't know how 'utilisation rate' and 'occupancy level' are defined.

In [5]:
# theoretical full utilisation hours: 15 weeks * 5 days * 12 hours
close_time = (ut_event
              .withColumn('close_time', 
                          (unix_timestamp(col('tod'), format='HH:mm') + 
                          3600*col('duration')).cast('timestamp'))
              .agg({'close_time': 'max'}))
open_time = (ut_event
             .withColumn('open_time', 
                         to_timestamp(col('tod'), format='HH:mm'))
             .agg({'open_time': 'min'}))
max_hrs = (close_time.collect()[0][0] -
           open_time.collect()[0][0]).seconds // 3600
t = ut_week.withColumn('max_hrs', 5 * lit(max_hrs))

# attended students per event
s = (ut_event.withColumnRenamed('id', 'event')
     .join(ut_occurs, on='event')
     .join(ut_attends, on='event')
     .join(ut_student, on=(ut_attends['student']==ut_student['id']))
     .join(ut_room, on=(ut_event['room']==ut_room['id']))
     .groupBy('room')
     .agg(sum('sze'), sum('capacity')))

(ut_event.withColumnRenamed('id', 'event')
 .join(ut_occurs, on='event')
 .join(ut_room.filter(col('capacity') > 60), 
       on=(ut_event['room']==ut_room['id']))
 .groupBy('room')
 .sum('duration')
 .join(s, on='room')
 .withColumn('util', col('sum(duration)') /
             t.groupBy().sum('max_hrs').collect()[0][0])
 .withColumn('occup_level', col('sum(sze)')/col('sum(capacity)'))
 .select('room', 'util', 'occup_level')
 .toPandas())

                                                                                

Unnamed: 0,room,util,occup_level
0,cr.SMH,0.24,0.274574


## 13.
**A one hour staff meeting is to be held between 09:00 and 17:00. Events which clash are to be cancelled. Identify the hour which will result in the least disruption.**

In [6]:
t = (ut_event.withColumn('tod', to_timestamp(ut_event['tod'], format='HH:mm'))
     .withColumn('duration', ut_event['duration'].cast('integer'))
     .join(ut_occurs, on=(ut_event['id']==ut_occurs['event']))
     .select('event', 'week', 'dow', 'tod', 'duration'))

for i in range(2, 10):
    t_ = t.filter(t['duration']==i)
    if t_.count() > 0:
        t_ = t_.withColumn('tod', t_['tod'] + expr(f'interval {i} hours'))
        t = t.union(t_)

(t.replace({'Monday': '0', 'Tuesday': '1', 'Wednesday': '2', 
            'Thursday': '3', 'Friday': '4'}, subset=['dow'])
 .withColumn('hour', hour(col('tod')))
 .groupBy('dow', 'hour')
 .pivot('week')
 .count()
 .orderBy('dow', 'hour')
 .toPandas())

Unnamed: 0,dow,hour,01,02,03,04,05,06,07,08,10,11,12,13
0,0,9,4,4,4,4,4,4,4,4,4,4,4,4
1,0,10,1,1,1,1,1,1,1,1,1,1,1,1
2,0,11,8,8,8,8,8,8,8,8,8,8,8,8
3,0,12,3,3,3,3,3,3,3,3,3,3,3,3
4,0,13,10,10,10,10,10,10,10,10,10,10,10,10
5,0,14,5,5,5,5,5,5,5,5,5,5,5,5
6,0,15,10,10,10,10,10,10,10,10,10,10,10,10
7,0,16,5,5,5,5,5,5,5,5,5,5,5,5
8,0,17,8,8,8,8,8,8,8,8,8,8,8,8
9,0,18,1,1,1,1,1,1,1,1,1,1,1,1


In [7]:
# You may find that week '09', '14', '15' are totally vacant
# or you can choose Mon 9am/12pm, Tue 5pm, Thu 9am, Fri 5pm

(t.withColumn('tod', date_format(t['tod'], 'HH:mm'))
 .filter(hour(col('tod')).between(9, 17))
 .groupBy('tod', 'dow')
 .agg(count('event').alias('event'))
 .orderBy('event', 'tod', 'dow')
 .limit(1)
 .toPandas())

Unnamed: 0,tod,dow,event
0,10:00,Monday,12


## 14.
**Find all clashes - include the events which clash and the staff, student or rooms that they have in common.**

In [8]:
t = (ut_event
     .withColumn('start', to_timestamp(ut_event['tod'], format='HH:mm'))
     .withColumnRenamed('id', 'event')
     .join(ut_occurs, on='event')
     .withColumn('end', (unix_timestamp(col('start')) + 3600*col('duration'))
                 .cast('timestamp')))
(t.alias('t1').join(t.alias('t2'), on=['week', 'dow', 'room'])
 .filter(col('t1.event') != col('t2.event'))
 .filter((((col('t1.start') >= col('t2.start')) &
           (col('t1.start') < col('t2.end'))) |
          ((col('t2.start') >= col('t1.start')) &
           (col('t2.start') < col('t1.end')))) &
         (col('t1.event') > col('t2.event')))
 .select(col('t1.event').alias('event1'),
         col('t2.event').alias('event2'),
         'room', 'week', 'dow')
 .distinct()
 .toPandas())

Unnamed: 0,event1,event2,room,week,dow
0,coh8412615.T03,co22008.T04,co.117+118,11,Tuesday
1,coh8412615.T03,co22008.T04,co.117+118,2,Tuesday
2,coh8412615.T03,co22008.T04,co.117+118,8,Tuesday
3,coh8412615.T03,co22008.T04,co.117+118,3,Tuesday
4,coh8412615.T03,co22008.T04,co.117+118,10,Tuesday
5,coh8412615.T03,co22008.T04,co.117+118,12,Tuesday
6,coh8412615.T03,co22008.T04,co.117+118,4,Tuesday
7,coh8412615.T03,co22008.T04,co.117+118,1,Tuesday
8,coh8412615.T03,co22008.T04,co.117+118,5,Tuesday
9,coh8412615.T03,co22008.T04,co.117+118,6,Tuesday


## 15.
**Produce a timetable for a group of full time students for week 1**

In [9]:
t = (ut_event
     .withColumn('tod', to_timestamp(ut_event['tod'], format='HH:mm'))
     .withColumnRenamed('id', 'event')
     .join(ut_occurs.filter(col('week')=='01'), on='event')
     .join(ut_attends, on='event'))
for i in range(2, 10):
    t_ = t.filter(t['duration']==i)
    if t_.count() > 0:
        t_ = t_.withColumn('tod', t_['tod'] + expr(f'interval {i} hours'))
        t = t.union(t_)

(t.withColumn('tod', hour(col('tod')))
 .withColumn('flag', lit('Y'))
 .groupBy('dow', 'student')
 .pivot('tod')
 .agg(first('flag'))
 .fillna('')
 .toPandas())

Unnamed: 0,dow,student,9,10,11,12,13,14,15,16,17,18,20
0,Friday,co.12008.Eb,,,,,,Y,,Y,,,
1,Friday,co.22022.E,Y,Y,,Y,,,,,,,
2,Friday,co1.CO,,Y,,,,,,,,,
3,Friday,co2.CO,,,,,,,Y,,Y,,
4,Friday,co2.CO.a,Y,,Y,,Y,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,Wednesday,com.ITeC.feb,,,,,,Y,,Y,,,
191,Wednesday,com.MM.feb,Y,,Y,Y,,,,,,,
192,Wednesday,com.MM.pt3,Y,,Y,Y,,,,,,,
193,Wednesday,com.SE,,,,,,Y,,Y,,,


In [10]:
sc.stop()