In [None]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

import re

In [2]:
# Get the active SparkSession, or create one if none exists yet.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Echo the session object as this cell's output.
spark

1. Read the case, department, and source data into their own spark dataframes.

In [4]:
# Case data: the first line of the file supplies the column names.
df = spark.read.option('header', True).csv('data/case.csv')

In [5]:
# Department data: the first line of the file supplies the column names.
dept = spark.read.option('header', True).csv('data/dept.csv')

In [6]:
# Source data: the first line of the file supplies the column names.
source = spark.read.option('header', True).csv('data/source.csv')

2. Let's see how writing to the local disk works in spark:


- Write the code necessary to store the source data in both csv and json format, store these as sources_csv and sources_json
- Inspect your folder structure. What do you notice?
    > """The outputs are not saved as single files with file extensions: each path is a directory containing part files."""

In [7]:
# Save the source data as CSV under data/sources_csv, replacing any earlier output.
# header=True writes the column names as the first line; without it the names
# are dropped and the output cannot be read back with its original columns.
source.write.csv('data/sources_csv', mode='overwrite', header=True)

In [8]:
# Save the source data as JSON under data/sources_json, replacing any earlier output.
source.write.mode('overwrite').json('data/sources_json')

3. Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types.

In [9]:
# Print the column names and types of the source dataframe.
source.printSchema()

root
 |-- source_id: string (nullable = true)
 |-- source_username: string (nullable = true)



In [10]:
# Print the column names and types of the department dataframe.
dept.printSchema()

root
 |-- dept_division: string (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: string (nullable = true)



In [11]:
# Print the column names and types of the case dataframe before any casting.
df.printSchema()

root
 |-- case_id: string (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: string (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: string (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: string (nullable = true)



In [15]:
# Look at a single case record, printed one field per line.
df.show(n=1, vertical=True)

-RECORD 0------------------------------------
 case_id              | 1014127332           
 case_opened_date     | 1/1/18 0:42          
 case_closed_date     | 1/1/18 12:29         
 SLA_due_date         | 9/26/20 0:42         
 case_late            | NO                   
 num_days_late        | -998.5087616000001   
 case_closed          | YES                  
 dept_division        | Field Operations     
 service_request_type | Stray Animal         
 SLA_days             | 999.0                
 case_status          | Closed               
 source_id            | svcCRMLS             
 request_address      | 2315  EL PASO ST,... 
 council_district     | 5                    
only showing top 1 row



In [16]:
# Convert the date columns from strings such as "1/1/18 0:42" to timestamps.

fmt = "M/d/yy H:mm"

# Each column is replaced in place, so the column order is unchanged.
for date_col in ('case_opened_date', 'case_closed_date', 'SLA_due_date'):
    df = df.withColumn(date_col, to_timestamp(date_col, fmt))


In [25]:
# Convert the "YES"/"NO" flag columns to booleans (True only where the value is "YES").
for flag_col in ('case_late', 'case_closed'):
    df = df.withColumn(flag_col, df[flag_col] == "YES")

In [30]:
# Convert the numeric columns from strings to numeric types.

numeric_types = {'num_days_late': "float", 'SLA_days': "int"}

for num_col, spark_type in numeric_types.items():
    df = df.withColumn(num_col, df[num_col].cast(spark_type))

In [32]:
# Print the case dataframe's schema again to check the result of the casts above.
df.printSchema()

root
 |-- case_id: string (nullable = true)
 |-- case_opened_date: timestamp (nullable = true)
 |-- case_closed_date: timestamp (nullable = true)
 |-- SLA_due_date: timestamp (nullable = true)
 |-- case_late: boolean (nullable = true)
 |-- num_days_late: float (nullable = true)
 |-- case_closed: boolean (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: integer (nullable = true)
 |-- case_status: string (nullable = true)
 |-- source_id: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: string (nullable = true)

