In [1]:
# https://data.sfgov.org/Public-Safety/Fire-Department-Calls-for-Service/nuek-vuh3
# https://www.databricks.com/notebooks/gallery/SanFranciscoFireCallsAnalysis.html
# Download here: https://raw.githubusercontent.com/databricks/LearningSparkV2/master/chapter3/data/sf-fire-calls.csv

from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Aula 4.2")
    .getOrCreate()
)

# Last expression of the cell: displays the Spark version in use.
spark.version

'3.2.1'

In [2]:
# Load the fire-calls CSV: the first line supplies the column names and the
# column types are inferred from the data instead of being declared.
fire_df = spark.read.csv(
    'sf-fire-calls.csv',
    header=True,
    inferSchema=True,
)

# Print the resulting column names and inferred types.
fire_df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

In [3]:
# Peek at the first five records, returned to the driver as a list of Row objects.
fire_df.take(5)

[Row(CallNumber=20110016, UnitID='T13', IncidentNumber=2003235, CallType='Structure Fire', CallDate='01/11/2002', WatchDate='01/10/2002', CallFinalDisposition='Other', AvailableDtTm='01/11/2002 01:51:44 AM', Address='2000 Block of CALIFORNIA ST', City='SF', Zipcode=94109, Battalion='B04', StationArea='38', Box='3362', OriginalPriority='3', Priority='3', FinalPriority=3, ALSUnit=False, CallTypeGroup=None, NumAlarms=1, UnitType='TRUCK', UnitSequenceInCallDispatch=2, FirePreventionDistrict='4', SupervisorDistrict='5', Neighborhood='Pacific Heights', Location='(37.7895840679362, -122.428071912459)', RowID='020110016-T13', Delay=2.95),
 Row(CallNumber=20110022, UnitID='M17', IncidentNumber=2003241, CallType='Medical Incident', CallDate='01/11/2002', WatchDate='01/10/2002', CallFinalDisposition='Other', AvailableDtTm='01/11/2002 03:01:18 AM', Address='0 Block of SILVERVIEW DR', City='SF', Zipcode=94124, Battalion='B10', StationArea='42', Box='6495', OriginalPriority='3', Priority='3', FinalP

In [4]:
# Total number of rows loaded from the CSV.
fire_call_total = fire_df.count()
fire_call_total

175296

In [5]:
# Register the DataFrame under the name "fire_table" so it can be queried with SQL.
fire_df.createOrReplaceTempView("fire_table")

# Row count expressed as a SQL query against the temp view.
row_count_df = spark.sql("SELECT COUNT(*) FROM fire_table")
row_count_df.show()


+--------+
|count(1)|
+--------+
|  175296|
+--------+



In [6]:
# Distinct call types via the DataFrame API.
distinct_call_types = fire_df.select('CallType').distinct()
distinct_call_types.show()

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|         Marine Fire|
|  Aircraft Emergency|
|      Administrative|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|Watercraft in Dis...|
|           Explosion|
|           Oil Spill|
|        Vehicle Fire|
|  Suspicious Package|
|Extrication / Ent...|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
|        Water Rescue|
+--------------------+
only showing top 20 rows



In [8]:
# Distinct call types again, this time as SQL against the "fire_table" view.
distinct_call_types_sql = spark.sql("SELECT distinct(CallType) FROM fire_table")
distinct_call_types_sql.show()

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|         Marine Fire|
|  Aircraft Emergency|
|      Administrative|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|Watercraft in Dis...|
|           Explosion|
|           Oil Spill|
|        Vehicle Fire|
|  Suspicious Package|
|Extrication / Ent...|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
|        Water Rescue|
+--------------------+
only showing top 20 rows



In [9]:
from pyspark.sql.functions import col

# Number of calls per call type, most frequent first.
calls_per_type = fire_df.groupBy('CallType').count()
by_count_desc = col("count").desc()
calls_per_type.sort(by_count_desc).show()

+--------------------+------+
|            CallType| count|
+--------------------+------+
|    Medical Incident|113794|
|      Structure Fire| 23319|
|              Alarms| 19406|
|   Traffic Collision|  7013|
|Citizen Assist / ...|  2524|
|               Other|  2166|
|        Outside Fire|  2094|
|        Vehicle Fire|   854|
|Gas Leak (Natural...|   764|
|        Water Rescue|   755|
|Odor (Strange / U...|   490|
|   Electrical Hazard|   482|
|Elevator / Escala...|   453|
|Smoke Investigati...|   391|
|          Fuel Spill|   193|
|              HazMat|   124|
|Industrial Accidents|    94|
|           Explosion|    89|
|Train / Rail Inci...|    57|
|  Aircraft Emergency|    36|
+--------------------+------+
only showing top 20 rows



In [10]:
# SQL equivalent of the groupBy/count/sort above: calls per type, most frequent first.
# Adjacent string literals reproduce the original query text exactly, whitespace included.
call_type_counts_query = (
    "SELECT CallType, COUNT(*) AS COUNT from fire_table  "
    "            GROUP BY CallType ORDER BY COUNT DESC"
)
spark.sql(call_type_counts_query).show()

+--------------------+------+
|            CallType| COUNT|
+--------------------+------+
|    Medical Incident|113794|
|      Structure Fire| 23319|
|              Alarms| 19406|
|   Traffic Collision|  7013|
|Citizen Assist / ...|  2524|
|               Other|  2166|
|        Outside Fire|  2094|
|        Vehicle Fire|   854|
|Gas Leak (Natural...|   764|
|        Water Rescue|   755|
|Odor (Strange / U...|   490|
|   Electrical Hazard|   482|
|Elevator / Escala...|   453|
|Smoke Investigati...|   391|
|          Fuel Spill|   193|
|              HazMat|   124|
|Industrial Accidents|    94|
|           Explosion|    89|
|Train / Rail Inci...|    57|
|  Aircraft Emergency|    36|
+--------------------+------+
only showing top 20 rows



In [11]:
# Name of the renamed delay column, defined once so the DataFrame rename and
# the SQL query below cannot drift apart. The original query spelled it
# "ResponseDelayInMins", which only resolved because Spark matches column
# names case-insensitively by default (spark.sql.caseSensitive=false).
RESPONSE_DELAY_COL = "ResponseDelayinMins"

# Only calls whose delay exceeds this value are listed.
# NOTE(review): the column name suggests the unit is minutes - confirm against the data dictionary.
DELAY_THRESHOLD = 50

# Rename "Delay" and re-register the temp view so SQL sees the new column name.
# withColumnRenamed is a no-op when "Delay" is absent, so re-running this cell is harmless.
fire_df = fire_df.withColumnRenamed("Delay", RESPONSE_DELAY_COL)
fire_df.createOrReplaceTempView("fire_table")

# Both interpolated values are constants defined in this cell, not external input.
long_delay_query = (
    f"SELECT CallNumber, {RESPONSE_DELAY_COL} FROM fire_table "
    f"WHERE {RESPONSE_DELAY_COL} > {DELAY_THRESHOLD}"
)
spark.sql(long_delay_query).show()



+----------+-------------------+
|CallNumber|ResponseDelayInMins|
+----------+-------------------+
|  20150265|           95.28333|
|  20510101|           79.98333|
|  20750413|          336.33334|
|  20750413|          330.66666|
|  21730011|              59.65|
|  21780097|              54.45|
|  23110482|             106.05|
|  30330258|           65.28333|
|  30640355|           67.03333|
|  30740040|          316.58334|
|  30800133|          628.61664|
|  30910242|          131.93333|
|  32170203|          115.48333|
|  32520252|          56.033333|
|  52780360|          53.083332|
|  60690316|           99.63333|
|  60760172|              75.95|
|  82100052|               84.6|
|  82410212|          91.916664|
|  82490156|          385.58334|
+----------+-------------------+
only showing top 20 rows

