# Helpdesk Hard

In [1]:
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

sc = (SparkSession.builder.appName('app12-3') 
      .master(f'spark://{SVR}:7077') 
      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') 
      .config('spark.cores.max', '4') 
      .config('spark.executor.instances', '1') 
      .config('spark.executor.cores', '2') 
      .config('spark.executor.memory', '10g') 
      .enableHiveSupport().getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
shift = sc.read.table('sqlzoo.Shift')
staff = sc.read.table('sqlzoo.Staff')
issue = sc.read.table('sqlzoo.Issue')
shift_type = sc.read.table('sqlzoo.Shift_type')
level = sc.read.table('sqlzoo.Level')
customer = sc.read.table('sqlzoo.Customer')
caller = sc.read.table('sqlzoo.Caller')

## 11.
Show the manager and number of calls received for each hour of the day on 2017-08-12

```
+---------+---------------+----+
| Manager | Hr            | cc |
+---------+---------------+----+
| LB1     | 2017-08-12 08 |  6 |
| LB1     | 2017-08-12 09 | 16 |
| LB1     | 2017-08-12 10 | 11 |
| LB1     | 2017-08-12 11 |  6 |
| LB1     | 2017-08-12 12 |  8 |
| LB1     | 2017-08-12 13 |  4 |
| AE1     | 2017-08-12 14 | 12 |
| AE1     | 2017-08-12 15 |  8 |
| AE1     | 2017-08-12 16 |  8 |
| AE1     | 2017-08-12 17 |  7 |
| AE1     | 2017-08-12 19 |  5 |
+---------+---------------+----+
```

In [3]:
(issue.withColumn('date_call', to_date(issue['Call_date']))
 .withColumn('hr', date_format(issue['Call_date'], 'yyyy-MM-dd HH'))
 .filter(col('date_call')=='2017-08-12')
 .join(shift, on=(col('Taken_by')==col('Operator')) & 
       (col('date_call')==col('Shift_date')))
 .groupBy('Manager', 'hr')
 .agg(count('Call_ref').alias('cc'))
 .orderBy('hr')
 .toPandas())

                                                                                

Unnamed: 0,Manager,hr,cc
0,LB1,2017-08-12 08,6
1,LB1,2017-08-12 09,16
2,LB1,2017-08-12 10,11
3,LB1,2017-08-12 11,6
4,LB1,2017-08-12 12,8
5,LB1,2017-08-12 13,4
6,AE1,2017-08-12 14,12
7,AE1,2017-08-12 15,8
8,AE1,2017-08-12 16,8
9,AE1,2017-08-12 17,7


## 12.
**80/20 rule. It is said that 80% of the calls are generated by 20% of the callers. Is this true? What percentage of calls are generated by the most active 20% of callers.**

Note - Andrew has not managed to do this in one query - but he believes it is possible.

```
+---------+
| t20pc   |
+---------+
| 32.2581 |
+---------+
```

In [4]:
(issue.groupBy('Caller_id')
 .agg(count('Caller_id').alias('n'))
 .orderBy(col('n').desc())
 .limit(int(issue[['Caller_id']].distinct().count() * 0.2))
 .agg(sum('n').alias('sum_n'))
 .withColumn('t20pc', 100 * col('sum_n') / issue.count())
 .select('t20pc')
 .toPandas())

Unnamed: 0,t20pc
0,32.258065


## 13.
**Annoying customers. Customers who call in the last five minutes of a shift are annoying. Find the most active customer who has never been annoying.**

```
+--------------+------+
| Company_name | abna |
+--------------+------+
| High and Co. |   20 |
+--------------+------+
```

In [5]:
annoy = (issue.withColumn('date_call', to_date(issue['Call_date']))
         .join(shift, on=(col('Taken_by')==col('Operator')) & 
                         (col('date_call')==col('Shift_date')))
         .join(shift_type, 'Shift_type')
         .join(caller, 'Caller_id')
         .join(customer, 'Company_ref')
         .select('Company_ref', 'Shift_date', 'End_time', 'Call_date')
         .withColumn('Shift_end', concat_ws(
             ' ', col('Shift_date').cast('string'), col('End_time')))
         .withColumn('till_shiftend', 
                     (to_timestamp(col('Shift_end')) - to_timestamp(col('Call_date')))
                     .cast('long'))
         .filter(col('till_shiftend') <= 5*60))

(issue.join(caller, 'Caller_id')
 .join(customer, 'Company_ref')
 .join(annoy, 'Company_ref', how='left_anti')
 .groupBy('Company_ref', 'Company_name')
 .agg(count('Caller_id').alias('abna'))
 .orderBy(col('abna').desc())
 .limit(1)
 .toPandas())

Unnamed: 0,Company_ref,Company_name,abna
0,146,High and Co.,20


## 14.
**Maximal usage. If every caller registered with a customer makes a call in one day then that customer has "maximal usage" of the service. List the maximal customers for 2017-08-13.**

```
+-------------------+--------------+-------------+
| company_name      | caller_count | issue_count |
+-------------------+--------------+-------------+
| Askew Inc.        |            2 |           2 |
| Bai Services      |            2 |           2 |
| Dasher Services   |            3 |           3 |
| High and Co.      |            5 |           5 |
| Lady Retail       |            4 |           4 |
| Packman Shipping  |            3 |           3 |
| Pitiable Shipping |            2 |           2 |
| Whale Shipping    |            2 |           2 |
+-------------------+--------------+-------------+
```

In [6]:
iss = (issue.filter((col('Call_date')>='2017-08-13') &
                    (col('Call_date')<'2017-08-14'))
       .join(caller, 'Caller_id', how='right')
       .join(customer, 'Company_ref', how='left')
       .groupBy('Caller_id', 'Company_ref', 'Company_name')
       .agg(count('Call_ref').alias('n'))
       .withColumn('iss', (col('n')>0).cast('integer')))

(iss.groupBy('Company_name')
 .agg(count('Company_ref').alias('caller_count'),
      sum('iss').alias('issue_count'))
 .filter(col('caller_count')==col('issue_count'))
 .orderBy('Company_name')
 .toPandas())

Unnamed: 0,Company_name,caller_count,issue_count
0,Askew Inc.,2,2
1,Bai Services,2,2
2,Dasher Services,3,3
3,High and Co.,5,5
4,Lady Retail,4,4
5,Packman Shipping,3,3
6,Pitiable Shipping,2,2
7,Whale Shipping,2,2


## 15.
**Consecutive calls occur when an operator deals with two callers within 10 minutes. Find the longest sequence of consecutive calls – give the name of the operator and the first and last call date in the sequence.**

```
+----------+---------------------+---------------------+-------+
| taken_by | first_call          | last_call           | calls |
+----------+---------------------+---------------------+-------+
| AB1      | 2017-08-14 09:06:00 | 2017-08-14 10:17:00 |    24 |
+----------+---------------------+---------------------+-------+
```

In [7]:
(issue.withColumn('lag', lag('Call_date', 1).over(
    Window.partitionBy('Taken_by').orderBy(['Taken_by', 'Call_date'])))
 .withColumn('flag', ((to_timestamp(col('Call_date')) - 
                       to_timestamp(col('lag'))).cast('long') > 10*60).cast('integer'))
 .withColumn('grp', sum('flag').over(
     Window.partitionBy('Taken_by').orderBy(['Taken_by', 'Call_date'])))
 .groupBy('Taken_by', 'grp')
 .agg(min('Call_date').alias('first_call'),
      max('Call_date').alias('last_call'),
      count('Caller_id').alias('n_calls'))
 .orderBy(col('n_calls').desc())
 .select('Taken_by', 'first_call', 'last_call', 'n_calls')
 .limit(1)
 .toPandas())

                                                                                

Unnamed: 0,Taken_by,first_call,last_call,n_calls
0,AB1,2017-08-14 09:06:00.0,2017-08-14 10:17:00.0,24


In [8]:
sc.stop()