In [6]:
# notebook dependencies
import pyspark
from pyspark.sql.functions import col, expr
from pyspark.sql.functions import regexp_extract, regexp_replace

# note: the pyspark avg and mean functions are aliases of each other
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean, lit

# note: the following import, imports all pyspark sql functions similar to above
from pyspark.sql.functions import *

# schema structures
from pyspark.sql.types import StructType, StructField, StringType

# start the spark session used by the rest of the notebook
# (getOrCreate hands back the already-running session if there is one)
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

# pandas, numpy, and matplotlib imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

# pydatasets
from pydataset import data

# tqdm loading bar library
from tqdm.notebook import tqdm, trange
import time # to be used in loop iterations

# disabling warnings
# import warnings
# warnings.filterwarnings('ignore')

In [10]:
# start with a pandas copy of the case dataset to get a quick overview
# of its columns, non-null counts, and dtypes

df = pd.read_csv(
    filepath_or_buffer="/Users/mijailmariano/codeup-data-science/pyspark_exercises/data/case.csv",
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 841704 entries, 0 to 841703
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   case_id               841704 non-null  int64  
 1   case_opened_date      841704 non-null  object 
 2   case_closed_date      823594 non-null  object 
 3   SLA_due_date          841671 non-null  object 
 4   case_late             841704 non-null  object 
 5   num_days_late         841671 non-null  float64
 6   case_closed           841704 non-null  object 
 7   dept_division         841704 non-null  object 
 8   service_request_type  841704 non-null  object 
 9   SLA_days              841671 non-null  float64
 10  case_status           841704 non-null  object 
 11  source_id             841704 non-null  object 
 12  request_address       841704 non-null  object 
 13  council_district      841704 non-null  int64  
dtypes: float64(2), int64(2), object(10)
memory usage: 89

In [13]:
# Read the case, department, and source data into their own spark dataframes.
# (this cell loads the case data)

file_path = "/Users/mijailmariano/codeup-data-science/pyspark_exercises/data/case.csv"

# specifying the column schema/types up front is the better option
# when computational speed is a priority, e.g.:
# schema = StructType([StructField("case_closed_date", StringType())])

# case dataset: comma-separated, first row holds the column names,
# and spark infers each column's type from the data
case = spark.read.csv(file_path, sep=",", header=True, inferSchema=True)

# preview the first two rows to confirm the csv loaded correctly
case.show(2)

                                                                                

+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|   case_id|case_opened_date|case_closed_date|SLA_due_date|case_late|      num_days_late|case_closed|   dept_division|service_request_type|   SLA_days|case_status|source_id|     request_address|council_district|
+----------+----------------+----------------+------------+---------+-------------------+-----------+----------------+--------------------+-----------+-----------+---------+--------------------+----------------+
|1014127332|     1/1/18 0:42|    1/1/18 12:29|9/26/20 0:42|       NO| -998.5087616000001|        YES|Field Operations|        Stray Animal|      999.0|     Closed| svcCRMLS|2315  EL PASO ST,...|               5|
|1014127333|     1/1/18 0:46|     1/3/18 8:11| 1/5/18 8:30|       NO|-2.0126041669999997|        YES|     Storm Water|Removal Of Obstru...|4.322222222| 

In [14]:
# department dataset -- read with the same csv options as the case dataset
dept = spark.read.csv(
    "/Users/mijailmariano/codeup-data-science/pyspark_exercises/data/dept.csv",
    sep=",", header=True, inferSchema=True,
)

# source dataset -- read with the same csv options as the case dataset
source = spark.read.csv(
    "/Users/mijailmariano/codeup-data-science/pyspark_exercises/data/source.csv",
    sep=",", header=True, inferSchema=True,
)

In [17]:
# Let's see how writing to the local disk works in spark:

# Write the code necessary to store the source data in both csv and json format, store these as sources_csv and sources_json
# Inspect your folder structure. What do you notice?

# The exercise asks for the *source* dataframe in both formats, so both writes
# use `source`, and each output is named after the format it actually contains.
# mode = "overwrite" lets the cell be re-run without failing on an existing output.
source.write.csv("/Users/mijailmariano/codeup-data-science/pyspark_exercises/data/sources_csv", mode = "overwrite")
source.write.json("/Users/mijailmariano/codeup-data-science/pyspark_exercises/data/sources_json", mode = "overwrite")

                                                                                

In [26]:
# column names and spark data types of the case dataset, as (name, type) pairs

case.dtypes

[('case_id', 'int'),
 ('case_opened_date', 'string'),
 ('case_closed_date', 'string'),
 ('case_due_date', 'string'),
 ('case_late', 'string'),
 ('num_days_late', 'double'),
 ('case_closed', 'string'),
 ('dept_division', 'string'),
 ('service_request_type', 'string'),
 ('SLA_days', 'double'),
 ('case_status', 'string'),
 ('source_id', 'string'),
 ('request_address', 'string'),
 ('council_district', 'int')]

In [34]:
# what do the date columns look like?

# "case_due_date" only exists once "SLA_due_date" has been renamed, which happens
# in a cell further down. The rename is applied here on a temporary view (without
# reassigning `case`) so this preview also works on a fresh top-to-bottom run;
# withColumnRenamed does nothing when the column has already been renamed.
(
    case
    .withColumnRenamed("SLA_due_date", "case_due_date")
    .select("case_opened_date", "case_closed_date", "case_due_date")
    .show(4)
)

+----------------+----------------+-------------+
|case_opened_date|case_closed_date|case_due_date|
+----------------+----------------+-------------+
|     1/1/18 0:42|    1/1/18 12:29| 9/26/20 0:42|
|     1/1/18 0:46|     1/3/18 8:11|  1/5/18 8:30|
|     1/1/18 0:48|     1/2/18 7:57|  1/5/18 8:30|
|     1/1/18 1:29|     1/2/18 8:13| 1/17/18 8:30|
+----------------+----------------+-------------+
only showing top 4 rows



22/09/14 02:01:38 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 3858400 ms exceeds timeout 120000 ms
22/09/14 02:01:39 WARN SparkContext: Killing executors is not supported by current scheduler.


In [32]:
# Inspect the data in your dataframes. Are the data types appropriate? Write the code necessary to cast the values to the appropriate types.

# give the SLA due date a name in line with the other case_* date columns
case = case.withColumnRenamed("SLA_due_date", "case_due_date")

# turn the "YES"/"NO" strings into proper booleans
# note: replacing a column's values is done through the 'withColumn' method
# note: a string value such as "YES" needs its own inner quotes when it is
#       written inside a pyspark 'expr' expression
case = case.withColumn('case_closed', expr('case_closed == "YES"'))
case = case.withColumn('case_late', expr('case_late == "YES"'))

# preview the two converted columns
case.select('case_closed', 'case_late').show(n=10)

+-----------+---------+
|case_closed|case_late|
+-----------+---------+
|       true|    false|
|       true|    false|
|       true|    false|
|       true|    false|
|       true|     true|
|       true|    false|
|       true|    false|
|       true|    false|
|       true|    false|
|       true|    false|
+-----------+---------+
only showing top 10 rows

