# Prepare

In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.functions import col, expr
import pandas as pd

## Part 1
1. Read the case.csv file from the 311 call data into a Spark DataFrame.
1. How old is the latest (in terms of days past SLA) currently open issue? How long has the oldest (in terms of days since opened) currently opened issue been open?
1. How many Stray Animal cases are there?
1. How many service requests that are assigned to the Field Operations department (dept_division) are not classified as "Officer Standby" request type (service_request_type)?
1. Create a new DataFrame without any information related to dates or location.
1. Read dept.csv into a Spark DataFrame. Inspect the dept_name column. Replace the missing values with "other".

##### Creating Spark Session object

In [2]:
spark = SparkSession.builder.master('local').appName('transform1').getOrCreate()

##### Reading in csv with spark

In [3]:
df = spark.read.csv('sa311/case.csv', header=True, inferSchema=True)

In [4]:
df.show(3)

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.5087616000001|        YES|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|       NO|-2.0126041669999997|        YES|     Storm Water|Removal Of Obstru...|4.322222222| 

In [5]:
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



##### How old is the latest (in terms of days past SLA) currently open issue? 

In [6]:
(df.select(F.to_date(col('SLA_due_date'),format='M/d/y H:m').alias('SLA_date'),
           F.current_date().alias('today'),
           col('case_opened_date').alias('date_opened'),
           col('case_closed').alias('closed'))
 .where(col('closed') == 'NO')
 .withColumn('date_diff_SLA_2_today', F.datediff(col('SLA_date'), col('today')))
 .sort(col('date_opened'), ascending=False)
 .show(1))

+----------+----------+------------+------+---------------------+
|  SLA_date|     today| date_opened|closed|date_diff_SLA_2_today|
+----------+----------+------------+------+---------------------+
|2017-11-14|2019-05-17|9/9/17 11:43|    NO|                 -549|
+----------+----------+------------+------+---------------------+
only showing top 1 row



##### How long has the oldest (in terms of days since opened) currently opened issue been open?

In [7]:
(df.select(F.to_date(col('SLA_due_date'), format='M/d/y H:m').alias('SLA_date'),
           F.current_date().alias('today'),
           F.to_date(col('case_opened_date'), format='M/d/y H:m').alias('date_opened'),
           col('case_closed').alias('closed'))
 .where(col('closed') == 'NO')
 .withColumn('date_diff_open_2_today', F.datediff(col('date_opened'), col('today')))
 .sort(col('date_opened'))
 .select(col('date_diff_open_2_today'))
 .show(1))

+----------------------+
|date_diff_open_2_today|
+----------------------+
|                  -866|
+----------------------+
only showing top 1 row



##### How many Stray Animal cases are there?

In [8]:
df.show(3)

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.5087616000001|        YES|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|       NO|-2.0126041669999997|        YES|     Storm Water|Removal Of Obstru...|4.322222222| 

In [9]:
df.select('*').where(col('service_request_type') == "Stray Animal").count()

26760

##### How many service requests that are assigned to the Field Operations department (dept_division) are not classified as "Officer Standby" request type (service_request_type)?

In [10]:
(df.select(col('dept_division'), col('service_request_type'))
 .where(col('dept_division') == 'Field Operations')
 .where(col('service_request_type') != 'Officer Standby')
 .count())

113902

##### Create a new DataFrame without any information related to dates or location.

In [11]:
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [12]:
new_df = df.select('case_id', 'case_late', 'num_days_late', 'case_closed', 'dept_division',
                   'service_request_type', 'SLA_days', 'case_status', 'source_id', 'council_district')

### Read dept.csv into a Spark DataFrame. Inspect the dept_name column. 
Replace the missing values with "other"

In [13]:
dept_df = spark.read.csv('sa311/dept.csv', header=True, inferSchema=True)
dept_df.show(5)

+--------------------+--------------------+----------------------+-------------------+
|       dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+--------------------+--------------------+----------------------+-------------------+
|     311 Call Center|    Customer Service|      Customer Service|                YES|
|               Brush|Solid Waste Manag...|           Solid Waste|                YES|
|     Clean and Green|Parks and Recreation|    Parks & Recreation|                YES|
|Clean and Green N...|Parks and Recreation|    Parks & Recreation|                YES|
|    Code Enforcement|Code Enforcement ...|  DSD/Code Enforcement|                YES|
+--------------------+--------------------+----------------------+-------------------+
only showing top 5 rows



In [14]:
dept_df.na.fill({'dept_name' : 'other'}).show(8)

+--------------------+--------------------+----------------------+-------------------+
|       dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+--------------------+--------------------+----------------------+-------------------+
|     311 Call Center|    Customer Service|      Customer Service|                YES|
|               Brush|Solid Waste Manag...|           Solid Waste|                YES|
|     Clean and Green|Parks and Recreation|    Parks & Recreation|                YES|
|Clean and Green N...|Parks and Recreation|    Parks & Recreation|                YES|
|    Code Enforcement|Code Enforcement ...|  DSD/Code Enforcement|                YES|
|Code Enforcement ...|Code Enforcement ...|  DSD/Code Enforcement|                YES|
|Code Enforcement ...|               other|  DSD/Code Enforcement|                YES|
|   Dangerous Premise|Code Enforcement ...|  DSD/Code Enforcement|                YES|
+--------------------+--------------------+

Useful to see all nulls per column.

In [15]:
dept_df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in dept_df.columns]).show()

+-------------+---------+----------------------+-------------------+
|dept_division|dept_name|standardized_dept_name|dept_subject_to_SLA|
+-------------+---------+----------------------+-------------------+
|            0|        1|                     0|                  0|
+-------------+---------+----------------------+-------------------+



## Part 2
1. Convert the council_district column to a string column.
1. Extract the year from the case_closed_date column.
1. Convert num_days_late from days to hours in new columns num_hours_late.
1. Convert the case_late column to a boolean column.
1. Convert the SLA_days columns to a double column.
1. Pull it all together

In [16]:
df.show(3)

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.5087616000001|        YES|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|       NO|-2.0126041669999997|        YES|     Storm Water|Removal Of Obstru...|4.322222222| 

##### Convert the council_district column to a string column.

In [17]:
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [18]:
df.select(col('council_district').cast('string')).printSchema()

root
 |-- council_district: string (nullable = true)



##### Extract the year from the case_closed_date column.

In [19]:
(df.select(col('case_closed_date'), 
          F.regexp_extract(col('case_closed_date'), '\d+/\d+/(\d+)', 1).alias('year'))
.show(5))

+----------------+----+
|case_closed_date|year|
+----------------+----+
|    1/1/18 12:29|  18|
|     1/3/18 8:11|  18|
|     1/2/18 7:57|  18|
|     1/2/18 8:13|  18|
|    1/1/18 13:29|  18|
+----------------+----+
only showing top 5 rows



##### Convert num_days_late from days to hours in new columns num_hours_late.

In [20]:
df.select(col('num_days_late')).withColumn('num_hours_late', F.round(df.num_days_late * 24, 2)).show(5)

+-------------------+--------------+
|      num_days_late|num_hours_late|
+-------------------+--------------+
| -998.5087616000001|     -23964.21|
|-2.0126041669999997|         -48.3|
|       -3.022337963|        -72.54|
|       -15.01148148|       -360.28|
|0.37216435200000003|          8.93|
+-------------------+--------------+
only showing top 5 rows



##### Convert the case_late column to a boolean column.

In [21]:
df.select(col('case_late'), col('num_days_late')).withColumn('case_late', col('case_late') == 'YES').show(5)

+---------+-------------------+
|case_late|      num_days_late|
+---------+-------------------+
|    false| -998.5087616000001|
|    false|-2.0126041669999997|
|    false|       -3.022337963|
|    false|       -15.01148148|
|     true|0.37216435200000003|
+---------+-------------------+
only showing top 5 rows



##### Convert the SLA_days columns to a double column.
It already is a double column, but might have needed to import the 'double' type to cast it if it wasn't.

##### Pull it all together

In [22]:
(df.select(col('council_district').cast('string'), 
           col('case_closed_date'), 
           F.regexp_extract(col('case_closed_date'), '\d+/\d+/(\d+)', 1).alias('year'), 
           col('case_late'), 
           col('num_days_late'))
 .withColumn('num_hours_late', F.round(df.num_days_late * 24, 2))
 .withColumn('case_late_bool', col('case_late') == 'YES').show(5))

+----------------+----------------+----+---------+-------------------+--------------+--------------+
|council_district|case_closed_date|year|case_late|      num_days_late|num_hours_late|case_late_bool|
+----------------+----------------+----+---------+-------------------+--------------+--------------+
|               5|    1/1/18 12:29|  18|       NO| -998.5087616000001|     -23964.21|         false|
|               3|     1/3/18 8:11|  18|       NO|-2.0126041669999997|         -48.3|         false|
|               3|     1/2/18 7:57|  18|       NO|       -3.022337963|        -72.54|         false|
|               3|     1/2/18 8:13|  18|       NO|       -15.01148148|       -360.28|         false|
|               7|    1/1/18 13:29|  18|      YES|0.37216435200000003|          8.93|          true|
+----------------+----------------+----+---------+-------------------+--------------+--------------+
only showing top 5 rows



## Part 3
1. Create a DataFrame with all combinations of council_district and service_request_type (regardless of whether the combination is observed in the data).
1. Join the case data with the source and department data.
1. Are there any cases that do not have a request source?

##### Create a DataFrame with all combinations of council_district and service_request_type 
(regardless of whether the combination is observed in the data).

In [23]:
df.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [24]:
a = (df.select(col('council_district')).dropDuplicates())

b = (df.select(col('service_request_type')).dropDuplicates())

b.crossJoin(a).orderBy(['service_request_type', 'council_district']).show(truncate=False)

+------------------------+----------------+
|service_request_type    |council_district|
+------------------------+----------------+
|"Vacant Lot-City (12"")"|0               |
|"Vacant Lot-City (12"")"|1               |
|"Vacant Lot-City (12"")"|2               |
|"Vacant Lot-City (12"")"|3               |
|"Vacant Lot-City (12"")"|4               |
|"Vacant Lot-City (12"")"|5               |
|"Vacant Lot-City (12"")"|6               |
|"Vacant Lot-City (12"")"|7               |
|"Vacant Lot-City (12"")"|8               |
|"Vacant Lot-City (12"")"|9               |
|"Vacant Lot-City (12"")"|10              |
|"Vacant Lot-City (48"")"|0               |
|"Vacant Lot-City (48"")"|1               |
|"Vacant Lot-City (48"")"|2               |
|"Vacant Lot-City (48"")"|3               |
|"Vacant Lot-City (48"")"|4               |
|"Vacant Lot-City (48"")"|5               |
|"Vacant Lot-City (48"")"|6               |
|"Vacant Lot-City (48"")"|7               |
|"Vacant Lot-City (48"")"|8     

##### Join the case data with the source and department data.

In [25]:
source_df = spark.read.csv('sa311/source.csv', header=True, inferSchema=True)

In [26]:
source_df.show(3)

+---------+----------------+
|source_id| source_username|
+---------+----------------+
|   100137|Merlene Blodgett|
|   103582|     Carmen Cura|
|   106463| Richard Sanchez|
+---------+----------------+
only showing top 3 rows



In [27]:
df.show(3)

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.5087616000001|        YES|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|       NO|-2.0126041669999997|        YES|     Storm Water|Removal Of Obstru...|4.322222222| 

In [28]:
dept_df.show(3)

+---------------+--------------------+----------------------+-------------------+
|  dept_division|           dept_name|standardized_dept_name|dept_subject_to_SLA|
+---------------+--------------------+----------------------+-------------------+
|311 Call Center|    Customer Service|      Customer Service|                YES|
|          Brush|Solid Waste Manag...|           Solid Waste|                YES|
|Clean and Green|Parks and Recreation|    Parks & Recreation|                YES|
+---------------+--------------------+----------------------+-------------------+
only showing top 3 rows



In [29]:
(df.join(source_df, 
         df.source_id == source_df.source_id, 
         'left')
 .join(dept_df, 
       df.dept_division == dept_df.dept_division, 
       'left')).limit(100).toPandas()

Unnamed: 0,case_id,case_opened_date,case_closed_date,SLA_due_date,case_late,num_days_late,case_closed,dept_division,service_request_type,SLA_days,case_status,source_id,request_address,council_district,source_id.1,source_username,dept_division.1,dept_name,standardized_dept_name,dept_subject_to_SLA
0,1014127332,1/1/18 0:42,1/1/18 12:29,9/26/20 0:42,NO,-998.508762,YES,Field Operations,Stray Animal,999.000000,Closed,svcCRMLS,"2315 EL PASO ST, San Antonio, 78207",5,svcCRMLS,svcCRMLS,Field Operations,Animal Care Services,Animal Care Services,YES
1,1014127333,1/1/18 0:46,1/3/18 8:11,1/5/18 8:30,NO,-2.012604,YES,Storm Water,Removal Of Obstruction,4.322222,Closed,svcCRMSS,"2215 GOLIAD RD, San Antonio, 78223",3,svcCRMSS,svcCRMSS,Storm Water,Trans & Cap Improvements,Trans & Cap Improvements,YES
2,1014127334,1/1/18 0:48,1/2/18 7:57,1/5/18 8:30,NO,-3.022338,YES,Storm Water,Removal Of Obstruction,4.320729,Closed,svcCRMSS,"102 PALFREY ST W, San Antonio, 78223",3,svcCRMSS,svcCRMSS,Storm Water,Trans & Cap Improvements,Trans & Cap Improvements,YES
3,1014127335,1/1/18 1:29,1/2/18 8:13,1/17/18 8:30,NO,-15.011481,YES,Code Enforcement,Front Or Side Yard Parking,16.291887,Closed,svcCRMSS,"114 LA GARDE ST, San Antonio, 78223",3,svcCRMSS,svcCRMSS,Code Enforcement,Code Enforcement Services,DSD/Code Enforcement,YES
4,1014127336,1/1/18 1:34,1/1/18 13:29,1/1/18 4:34,YES,0.372164,YES,Field Operations,Animal Cruelty(Critical),0.125000,Closed,svcCRMSS,"734 CLEARVIEW DR, San Antonio, 78228",7,svcCRMSS,svcCRMSS,Field Operations,Animal Care Services,Animal Care Services,YES
5,1014127337,1/1/18 6:28,1/1/18 14:38,1/31/18 8:30,NO,-29.743981,YES,Signals,Traffic Signal Ops and Maintenance,30.084468,Closed,svcCRMSS,BANDERA RD and BRESNAHAN,7,svcCRMSS,svcCRMSS,Signals,Trans & Cap Improvements,Trans & Cap Improvements,YES
6,1014127338,1/1/18 6:57,1/2/18 15:32,1/17/18 8:30,NO,-14.706736,YES,Code Enforcement,Front Or Side Yard Parking,16.064294,Closed,svcCRMSS,"10133 FIGARO CANYON, San Antonio, 78251",4,svcCRMSS,svcCRMSS,Code Enforcement,Code Enforcement Services,DSD/Code Enforcement,YES
7,1014127339,1/1/18 6:58,1/2/18 15:32,1/17/18 8:30,NO,-14.706620,YES,Code Enforcement,Front Or Side Yard Parking,16.063796,Closed,svcCRMSS,"10133 FIGARO CANYON, San Antonio, 78251",4,svcCRMSS,svcCRMSS,Code Enforcement,Code Enforcement Services,DSD/Code Enforcement,YES
8,1014127340,1/1/18 6:58,1/2/18 15:32,1/17/18 8:30,NO,-14.706620,YES,Code Enforcement,Right Of Way/Sidewalk Obstruction,16.063333,Closed,svcCRMSS,"10133 FIGARO CANYON, San Antonio, 78251",4,svcCRMSS,svcCRMSS,Code Enforcement,Code Enforcement Services,DSD/Code Enforcement,YES
9,1014127341,1/1/18 6:59,1/2/18 15:32,1/17/18 8:30,NO,-14.706493,YES,Code Enforcement,Front Or Side Yard Parking,16.062859,Closed,svcCRMSS,"10133 FIGARO CANYON, San Antonio, 78251",4,svcCRMSS,svcCRMSS,Code Enforcement,Code Enforcement Services,DSD/Code Enforcement,YES


##### Are there any cases that do not have a request source?
No

In [30]:
(df.join(source_df, 
         df.source_id == source_df.source_id, 
         'left')
 .join(dept_df, 
       df.dept_division == dept_df.dept_division, 
       'left')).where(F.isnull(df.source_id)).show()

+-------+----------------+----------------+------------+---------+-------------+-----------+-------------+--------------------+--------+-----------+---------+---------------+----------------+---------+---------------+-------------+---------+----------------------+-------------------+
|case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|num_days_late|case_closed|dept_division|service_request_type|SLA_days|case_status|source_id|request_address|council_district|source_id|source_username|dept_division|dept_name|standardized_dept_name|dept_subject_to_SLA|
+-------+----------------+----------------+------------+---------+-------------+-----------+-------------+--------------------+--------+-----------+---------+---------------+----------------+---------+---------------+-------------+---------+----------------------+-------------------+
+-------+----------------+----------------+------------+---------+-------------+-----------+-------------+--------------------+--------+---------

## Part 4
1. Who are the top 10 service request types in terms of number of requests?
1. Who are the top 10 service request types in terms of average days late?
1. Does number of days late depend on department?
1. How do number of days late depend on department division and request type?

In [31]:
full = (df.join(source_df, 
         'source_id', 
         'left')
        .join(dept_df, 
       'dept_division', 
       'left'))

##### What are the top 10 service request types in terms of number of requests?

In [37]:
full.groupBy(full.service_request_type).count().sort('count', ascending=False).show(10, truncate=False)

+--------------------------------+-----+
|service_request_type            |count|
+--------------------------------+-----+
|No Pickup                       |89210|
|Overgrown Yard/Trash            |66403|
|Bandit Signs                    |32968|
|Damaged Cart                    |31163|
|Front Or Side Yard Parking      |28920|
|Stray Animal                    |27361|
|Aggressive Animal(Non-Critical) |25492|
|Cart Exchange Request           |22608|
|Junk Vehicle On Private Property|21649|
|Pot Hole Repair                 |20827|
+--------------------------------+-----+
only showing top 10 rows



##### What are the top 10 service request types in terms of average days late?

In [38]:
(full.select(full.service_request_type,
             full.num_days_late)
 .groupBy(full.service_request_type)
 .avg('num_days_late')
 .sort('avg(num_days_late)', ascending=False)
 .show(10, truncate=False))

+--------------------------------------+------------------+
|service_request_type                  |avg(num_days_late)|
+--------------------------------------+------------------+
|Zoning: Junk Yards                    |175.95636210420932|
|Labeling for Used Mattress            |162.43032902285717|
|Record Keeping of Used Mattresses     |153.99724039428568|
|Signage Requied for Sale of Used Mattr|151.63868055333333|
|Storage of Used Mattress              |142.112556415     |
|Zoning: Recycle Yard                  |135.9285161247979 |
|Donation Container Enforcement        |131.75610506358706|
|License Requied Used Mattress Sales   |128.79828704142858|
|Traffic Signal Graffiti               |101.79846062200002|
|Complaint                             |72.87050230311685 |
+--------------------------------------+------------------+
only showing top 10 rows



##### Does number of days late depend on department?

In [34]:
(full.select(full.dept_name, full.num_days_late)
 .groupBy('dept_name')
 .avg('num_days_late')
 .show())

+--------------------+-------------------+
|           dept_name| avg(num_days_late)|
+--------------------+-------------------+
|Animal Care Services| -226.5178394055038|
|                null|  135.9285161247979|
|Solid Waste Manag...|-2.2000575136721308|
|Development Services| 13.433724555869683|
|Trans & Cap Impro...| -20.61283735405259|
|    Customer Service| 59.737091496300735|
|        Metro Health| -4.911766979607019|
|Parks and Recreation| -5.251521960055141|
|Code Enforcement ...| -38.70133068329481|
|        City Council|               null|
+--------------------+-------------------+



##### How do number of days late depend on department division and request type?

In [35]:
(full.select(full.dept_division, full.num_days_late, full.service_request_type)
 .groupBy(['dept_division', 'service_request_type'])
 .avg('num_days_late')
 .orderBy(['dept_division', 'service_request_type'])
 .show())

+---------------+--------------------+--------------------+
|  dept_division|service_request_type|  avg(num_days_late)|
+---------------+--------------------+--------------------+
|311 Call Center|           Complaint|   72.87050230311685|
|311 Call Center|          Compliment|   -8.03018814654584|
|          Brush|Brush - Out of Cy...|  -4.316590201619218|
|          Brush|Brush / Bulky Mis...| -3.0420300215330762|
|          Brush|Brush Missed Pick up|  -5.879397585624517|
|          Brush|     Brush No Notice|  -2.938521994696245|
|          Brush|Brush Partial Pic...|  -5.288008891714286|
|          Brush|Brush Property Da...|  -4.930477772768818|
|          Brush|Brush-non Complia...| -2.7342875787021272|
|          Brush|Brush_Received In...|  -7.224293981400001|
|          Brush| Bulk Missed Pick up|  -3.447529808084811|
|          Brush|Bulk Partial Pick Up|-0.45940326771153916|
|          Brush|      Bulk no Notice|  -2.998967278570979|
|          Brush|Bulk-non Complian...| -