# Congestion Charges - Medium

In [1]:
import findspark
import pandas as pd
findspark.init()

SVR = '192.168.31.31'
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import Window

sc = (SparkSession.builder.appName('app18-2') 
      .master(f'spark://{SVR}:7077') 
      .config('spark.sql.warehouse.dir', f'hdfs://{SVR}:9000/user/hive/warehouse') 
      .config('spark.cores.max', '4') 
      .config('spark.executor.instances', '1') 
      .config('spark.executor.cores', '2') 
      .config('spark.executor.memory', '10g') 
      .enableHiveSupport().getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
camera = sc.read.table('sqlzoo.camera')
keeper = sc.read.table('sqlzoo.keeper')
vehicle = sc.read.table('sqlzoo.vehicle')
image = sc.read.table('sqlzoo.image')
permit = sc.read.table('sqlzoo.permit')

## 1.
List the owners (name and address) of Vehicles caught by camera 1 or 18 without duplication.

In [3]:
(keeper.withColumnRenamed('id', 'keeper')
 .join(vehicle, on='keeper')
 .join(image.filter(image['camera'].isin([1, 18])), 
       on=(vehicle['id']==image['reg']))
 .select('name', 'address')
 .distinct()
 .toPandas())

                                                                                

Unnamed: 0,name,address
0,"Ambiguous, Arthur",Absorption Ave.
1,"Inconspicuous, Iain",Interception Rd.
2,"Strenuous, Sam",Surjection Street


## 2.
Show keepers (name and address) who have more than 5 vehicles.

In [4]:
(keeper.withColumnRenamed('id', 'keeper')
 .join(vehicle, on='keeper')
 .groupBy('name', 'address')
 .agg(count('keeper').alias('keeper'))
 .filter(col('keeper')>5)
 .select('name', 'address')
 .toPandas())

Unnamed: 0,name,address
0,"Ambiguous, Arthur",Absorption Ave.
1,"Inconspicuous, Iain",Interception Rd.


## 3.
For each vehicle show the number of current permits (suppose today is the 1st of Feb 2007). The list should include the vehicle.s registration and the number of permits. Current permits can be determined based on charge types, e.g. for weekly permit you can use the date after 24 Jan 2007 and before 02 Feb 2007.

In [5]:
(permit.withColumn('sdate', to_timestamp(col('sdate')))
 .withColumn('edate', when(
    col('chargetype')=='Daily', col('sdate') + expr('interval 1 day')).when(
    col('chargetype')=='Weekly', col('sdate') + expr('interval 1 week')).when(
    col('chargetype')=='Monthly', col('sdate') + expr('interval 1 month')).when(
    col('chargetype')=='Annual', col('sdate') + expr('interval 1 year')))
 .filter((col('sdate') <= '2007-02-01') & (col('edate') >= '2007-02-01'))
 .groupBy('reg')
 .agg(count('chargetype').alias('reg'))
 .orderBy('reg')
 .toPandas())

Unnamed: 0,reg,reg.1
0,SO 02 DSP,1
1,SO 02 DTP,1
2,SO 02 JSP,1
3,SO 02 KSP,1
4,SO 02 KTP,1
5,SO 02 QSP,1
6,SO 02 RSP,1


## 4.
Obtain a list of every vehicle passing camera 10 on 25th Feb 2007. Show the time, the registration and the name of the keeper if available.

In [6]:
(vehicle.join(keeper.withColumnRenamed('id', 'keeper'), on='keeper')
 .join(image.filter((image['whn'].between('2007-02-25', '2007-02-26')) &
                    (image['camera']==10)), on=(vehicle['id']==image['reg']))
 .select('reg', 'whn', 'name')
 .toPandas())

Unnamed: 0,reg,whn,name
0,SO 02 CSP,2007-02-25 07:45:11.0,"Ambiguous, Arthur"
1,SO 02 ESP,2007-02-25 18:08:40.0,"Ambiguous, Arthur"


## 5.
List the keepers who have more than 4 vehicles and one of them must have more than 2 permits. The list should include the names and the number of vehicles.

In [7]:
# keepers who have more than 2 permits
t = (vehicle.join(permit
                  .groupBy('reg')
                  .agg(count('chargetype').alias('npermit'))
                  .filter(col('npermit')>2)
                  .select('reg').alias('p'), 
                  on=(vehicle['id']==col('reg')))
    .select('keeper'))

(vehicle.join(keeper.withColumnRenamed('id', 'keeper')
              .join(t, on='keeper'),
              on='keeper')
 .groupBy('name')
 .agg(count('id').alias('n_vehicle'))
 .select('name', 'n_vehicle')
 .filter(col('n_vehicle')>4)
 .toPandas())

Unnamed: 0,name,n_vehicle
0,"Inconspicuous, Iain",7


In [8]:
sc.stop()