In [10]:
import pandas as pd

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType
spark = SparkSession.builder.getOrCreate()

In [11]:
source = (spark.read.csv("source.csv",
                     sep=",",
                     header=True,
                     inferSchema=True)
     )

In [12]:
dept = (spark.read.csv("dept.csv",
                     sep=",",
                     header=True,
                     inferSchema=True)
     )

In [39]:
case = (spark.read.csv("case.csv",
                     sep=",",
                     header=True,
                     inferSchema=True)
     )

In [40]:
source.write.json("source_json", mode="overwrite")
dept.write.json("dept_json", mode = 'overwrite')
case.write.json('case_json', mode = 'overwrite')

In [41]:
source.printSchema()

root
 |-- source_id: string (nullable = true)
 |-- source_username: string (nullable = true)



In [97]:
source.show(50)

+---------+--------------------+
|source_id|     source_username|
+---------+--------------------+
|   100137|    Merlene Blodgett|
|   103582|         Carmen Cura|
|   106463|     Richard Sanchez|
|   119403|      Betty De Hoyos|
|   119555|      Socorro Quiara|
|   119868| Michelle San Miguel|
|   120752|      Eva T. Kleiber|
|   124405|           Lori Lara|
|   132408|       Leonard Silva|
|   135723|        Amy Cardenas|
|   136202|    Michelle Urrutia|
|   136979|      Leticia Garcia|
|   137943|    Pamela K. Baccus|
|   138605|        Marisa Ozuna|
|   138650|      Kimberly Green|
|   138650|Kimberly Green-Woods|
|   138793| Guadalupe Rodriguez|
|   138810|       Tawona Martin|
|   139342|     Jessica Mendoza|
|   139344|        Isis Mendoza|
|   139345|      Andrea Alvarez|
|   139807|        Jerry Robles|
|   139868|        James Garcia|
|   140436|         Jose Acosta|
|   140507|      Beatriz Urbina|
|   140508|San Juanita Villa...|
|   140509|           Renee Key|
|   140637

In [93]:
source.count()

140

In [94]:
case.count()

841704

In [43]:
dept.printSchema()

root
 |-- dept_division: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: string (nullable = true)



In [44]:
dept.show(1)

+---------------+----------------+----------------------+-------------------+
|  dept_division|       dept_name|standardized_dept_name|dept_subject_to_SLA|
+---------------+----------------+----------------------+-------------------+
|311 Call Center|Customer Service|      Customer Service|                YES|
+---------------+----------------+----------------------+-------------------+
only showing top 1 row



In [52]:
dept = dept.withColumn('dept_subject_to_SLA', expr('dept_subject_to_SLA == "YES"'))

In [45]:
case.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)



In [131]:
case.select((case.source_id.isNull()) == True).filter(case.source_id.isNull()) == True

False

In [46]:
case.show(1, truncate = False, vertical = True)

-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 1/1/18 0:42                          
 case_closed_date     | 1/1/18 12:29                         
 SLA_due_date         | 9/26/20 0:42                         
 case_late            | NO                                   
 num_days_late        | -998.5087616000001                   
 case_closed          | YES                                  
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  EL PASO ST, San Antonio, 78207 
 council_district     | 5                                    
only showing top 1 row



In [47]:
case = case.withColumn('case_closed', expr('case_closed == "YES"'))\
.withColumn('case_late', expr('case_late == "YES"'))

In [36]:
case.show(1, truncate = False, vertical = True)

-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 1/1/18 0:42                          
 case_closed_date     | 1/1/18 12:29                         
 SLA_due_date         | 9/26/20 0:42                         
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  EL PASO ST, San Antonio, 78207 
 council_district     | 5                                    
only showing top 1 row



In [48]:
case = case.withColumn('council_district', col('council_district').cast('string'))

In [50]:
case.show(1, truncate = False, vertical = True)

-RECORD 0----------------------------------------------------
 case_id              | 1014127332                           
 case_opened_date     | 1/1/18 0:42                          
 case_closed_date     | 1/1/18 12:29                         
 SLA_due_date         | 9/26/20 0:42                         
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 source_id            | svcCRMLS                             
 request_address      | 2315  EL PASO ST, San Antonio, 78207 
 council_district     | 5                                    
only showing top 1 row



In [54]:
fmt = "M/d/yy H:mm"

case = case.withColumn('case_opened_date', to_timestamp('case_opened_date', fmt))\
.withColumn('case_closed_date', to_timestamp('case_closed_date', fmt))\
.withColumn('SLA_due_date', to_timestamp('SLA_due_date', fmt))

## 1. How old is the latest (in terms of days past SLA) currently open issue? How long has the oldest (in terms of days since opened) currently opened issue been open?

In [60]:
case.where((case.case_closed == 'NO') & (case.num_days_late > 0)).select(min(case.num_days_late)).show(1)

+------------------+
|min(num_days_late)|
+------------------+
|       0.060810185|
+------------------+



In [61]:
case.where((case.case_closed == 'NO') & (case.num_days_late > 0)).select(max(case.num_days_late)).show(1)

+------------------+
|max(num_days_late)|
+------------------+
|       348.6458333|
+------------------+



## 2. How many Stray Animal cases are there?

In [63]:
case.where(case.service_request_type == 'Stray Animal').count()

26760

## 3. How many service requests that are assigned to the Field Operations department (dept_division) are not classified as "Officer Standby" request type (service_request_type)?

In [65]:
case.select(case.service_request_type != 'Officer Standby').where(case.dept_division == 'Field Operations').count()

116915

## .4 Convert the council_district column to a string column.

In [68]:
## already conveted to a string column
case.printSchema()

root
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: timestamp (nullable = true)
 |-- case_closed_date: timestamp (nullable = true)
 |-- SLA_due_date: timestamp (nullable = true)
 |-- case_late: boolean (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: boolean (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: string (nullable = true)



## .5 Extract the year from the case_closed_date column.

In [71]:
case = case.withColumn('year', year("case_closed_date"))

In [74]:
case.select(case.year).show(5)

+----+
|year|
+----+
|2018|
|2018|
|2018|
|2018|
|2018|
+----+
only showing top 5 rows



## 6. Convert num_days_late from days to hours in new columns num_hours_late.

In [76]:
case.withColumn('num_hours_late', case.num_days_late * 24).show(5, truncate = False, vertical = True)

-RECORD 0-----------------------------------------------------
 case_id              | 1014127332                            
 case_opened_date     | 2018-01-01 00:42:00                   
 case_closed_date     | 2018-01-01 12:29:00                   
 SLA_due_date         | 2020-09-26 00:42:00                   
 case_late            | false                                 
 num_days_late        | -998.5087616000001                    
 case_closed          | true                                  
 dept_division        | Field Operations                      
 service_request_type | Stray Animal                          
 SLA_days             | 999.0                                 
 case_status          | Closed                                
 source_id            | svcCRMLS                              
 request_address      | 2315  EL PASO ST, San Antonio, 78207  
 council_district     | 5                                     
 year                 | 2018                           

## 7. Join the case data with the source and department data.

In [81]:
case_df = case.join(source, on='source_id', how='left')
case_df.show(1, truncate = False, vertical = True)

-RECORD 0----------------------------------------------------
 source_id            | svcCRMLS                             
 case_id              | 1014127332                           
 case_opened_date     | 2018-01-01 00:42:00                  
 case_closed_date     | 2018-01-01 12:29:00                  
 SLA_due_date         | 2020-09-26 00:42:00                  
 case_late            | false                                
 num_days_late        | -998.5087616000001                   
 case_closed          | true                                 
 dept_division        | Field Operations                     
 service_request_type | Stray Animal                         
 SLA_days             | 999.0                                
 case_status          | Closed                               
 request_address      | 2315  EL PASO ST, San Antonio, 78207 
 council_district     | 5                                    
 year                 | 2018                                 
 source_

In [83]:
cases_df = case_df.join(dept, on='dept_division', how='left')

In [84]:
cases_df.show(1, truncate = False, vertical = True)

-RECORD 0------------------------------------------------------
 dept_division          | Field Operations                     
 source_id              | svcCRMLS                             
 case_id                | 1014127332                           
 case_opened_date       | 2018-01-01 00:42:00                  
 case_closed_date       | 2018-01-01 12:29:00                  
 SLA_due_date           | 2020-09-26 00:42:00                  
 case_late              | false                                
 num_days_late          | -998.5087616000001                   
 case_closed            | true                                 
 service_request_type   | Stray Animal                         
 SLA_days               | 999.0                                
 case_status            | Closed                               
 request_address        | 2315  EL PASO ST, San Antonio, 78207 
 council_district       | 5                                    
 year                   | 2018          

## 8. Are there any cases that do not have a request source?

In [88]:
cases_df.where(cases_df.source_username == None).show(vertical=True)

(0 rows)



In [135]:
cases_df.select(cases_df.source_id).filter(cases_df.source_id.isNull() == True).show()

+---------+
|source_id|
+---------+
+---------+

