In [4]:
# https://data.sfgov.org/Public-Safety/Fire-Department-Calls-for-Service/nuek-vuh3
# https://www.databricks.com/notebooks/gallery/SanFranciscoFireCallsAnalysis.html
# Download here: https://raw.githubusercontent.com/databricks/LearningSparkV2/master/chapter3/data/sf-fire-calls.csv

from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Aula 3.2")
    .getOrCreate()
)

# Last expression of the cell: displays the version of the running Spark.
spark.version

'3.2.1'

In [5]:
# Load the fire-calls CSV: the first line supplies the column names and Spark
# infers each column's type from the data.
fire_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('sf-fire-calls.csv')
)

# Print the inferred column names and types.
fire_df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

In [6]:
# Peek at the first five rows, returned as a list of Row objects.
fire_df.take(5)

[Row(CallNumber=20110016, UnitID='T13', IncidentNumber=2003235, CallType='Structure Fire', CallDate='01/11/2002', WatchDate='01/10/2002', CallFinalDisposition='Other', AvailableDtTm='01/11/2002 01:51:44 AM', Address='2000 Block of CALIFORNIA ST', City='SF', Zipcode=94109, Battalion='B04', StationArea='38', Box='3362', OriginalPriority='3', Priority='3', FinalPriority=3, ALSUnit=False, CallTypeGroup=None, NumAlarms=1, UnitType='TRUCK', UnitSequenceInCallDispatch=2, FirePreventionDistrict='4', SupervisorDistrict='5', Neighborhood='Pacific Heights', Location='(37.7895840679362, -122.428071912459)', RowID='020110016-T13', Delay=2.95),
 Row(CallNumber=20110022, UnitID='M17', IncidentNumber=2003241, CallType='Medical Incident', CallDate='01/11/2002', WatchDate='01/10/2002', CallFinalDisposition='Other', AvailableDtTm='01/11/2002 03:01:18 AM', Address='0 Block of SILVERVIEW DR', City='SF', Zipcode=94124, Battalion='B10', StationArea='42', Box='6495', OriginalPriority='3', Priority='3', FinalP

In [7]:
# Total number of rows loaded into fire_df.
fire_df.count()

175296

In [8]:
# List the distinct call types (show() prints at most 20 rows by default).
(
    fire_df
    .select('CallType')
    .distinct()
    .show()
)

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|         Marine Fire|
|  Aircraft Emergency|
|      Administrative|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|Watercraft in Dis...|
|           Explosion|
|           Oil Spill|
|        Vehicle Fire|
|  Suspicious Package|
|Extrication / Ent...|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
|        Water Rescue|
+--------------------+
only showing top 20 rows



In [None]:
# Number of distinct values in the CallType column.
(
    fire_df
    .select('CallType')
    .distinct()
    .count()
)

In [9]:
from pyspark.sql.functions import col

# Number of calls per call type, most frequent first.
(
    fire_df
    .groupBy('CallType')
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------------------+------+
|            CallType| count|
+--------------------+------+
|    Medical Incident|113794|
|      Structure Fire| 23319|
|              Alarms| 19406|
|   Traffic Collision|  7013|
|Citizen Assist / ...|  2524|
|               Other|  2166|
|        Outside Fire|  2094|
|        Vehicle Fire|   854|
|Gas Leak (Natural...|   764|
|        Water Rescue|   755|
|Odor (Strange / U...|   490|
|   Electrical Hazard|   482|
|Elevator / Escala...|   453|
|Smoke Investigati...|   391|
|          Fuel Spill|   193|
|              HazMat|   124|
|Industrial Accidents|    94|
|           Explosion|    89|
|Train / Rail Inci...|    57|
|  Aircraft Emergency|    36|
+--------------------+------+
only showing top 20 rows



In [10]:
# Number of calls per (call type, unit type) pair, most frequent first.
(
    fire_df
    .groupBy('CallType', 'UnitType')
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------------------+--------------+-----+
|            CallType|      UnitType|count|
+--------------------+--------------+-----+
|    Medical Incident|         MEDIC|50614|
|    Medical Incident|        ENGINE|44259|
|    Medical Incident|       PRIVATE|10297|
|      Structure Fire|        ENGINE| 8840|
|      Structure Fire|         TRUCK| 6815|
|              Alarms|        ENGINE| 6641|
|              Alarms|         TRUCK| 6396|
|              Alarms|         CHIEF| 6311|
|      Structure Fire|         CHIEF| 4548|
|    Medical Incident|RESCUE CAPTAIN| 3582|
|   Traffic Collision|        ENGINE| 2873|
|    Medical Incident|         TRUCK| 2582|
|   Traffic Collision|         MEDIC| 2489|
|        Outside Fire|        ENGINE| 1629|
|Citizen Assist / ...|        ENGINE| 1410|
|      Structure Fire|         MEDIC| 1406|
|      Structure Fire|  RESCUE SQUAD| 1320|
|    Medical Incident|  RESCUE SQUAD| 1182|
|               Other|        ENGINE|  931|
|Citizen Assist / ...|         T

In [11]:
# Rename the "Delay" column to a more descriptive name; all other columns are
# left untouched.
fire_df = fire_df.withColumnRenamed(
    existing="Delay",
    new="ResponseDelayedinMins",
)

# Print the schema again after the rename.
fire_df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

In [14]:
# First five calls whose ResponseDelayedinMins value is greater than 15.
(
    fire_df
    .select("CallNumber", "ResponseDelayedinMins")
    .filter(col("ResponseDelayedinMins") > 15)
    .show(5)
)


+----------+---------------------+
|CallNumber|ResponseDelayedinMins|
+----------+---------------------+
|  20150265|             95.28333|
|  20230229|            15.966666|
|  20240272|            42.583332|
|  20290468|                41.25|
|  20310211|                 43.0|
+----------+---------------------+
only showing top 5 rows



In [15]:
# Count calls per (CallType, Zipcode) pair and list the most frequent pairs first.
# The column is spelled "Zipcode", exactly as in the loaded schema, in both the
# projection and the grouping. The earlier "ZipCode" spelling only resolved
# because Spark's column resolution is case-insensitive by default
# (spark.sql.caseSensitive=false); it would fail on a case-sensitive session.
(
    fire_df
    .select("CallType", "Zipcode")
    .where(col("CallType").isNotNull())  # skip rows that have no call type
    .groupBy("CallType", "Zipcode")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+----------------+-------+-----+
|        CallType|Zipcode|count|
+----------------+-------+-----+
|Medical Incident|  94102|16130|
|Medical Incident|  94103|14775|
|Medical Incident|  94110| 9995|
|Medical Incident|  94109| 9479|
|Medical Incident|  94124| 5885|
|Medical Incident|  94112| 5630|
|Medical Incident|  94115| 4785|
|Medical Incident|  94122| 4323|
|Medical Incident|  94107| 4284|
|Medical Incident|  94133| 3977|
|Medical Incident|  94117| 3522|
|Medical Incident|  94134| 3437|
|Medical Incident|  94114| 3225|
|Medical Incident|  94118| 3104|
|Medical Incident|  94121| 2953|
|Medical Incident|  94116| 2738|
|Medical Incident|  94132| 2594|
|  Structure Fire|  94110| 2267|
|Medical Incident|  94105| 2258|
|  Structure Fire|  94102| 2229|
+----------------+-------+-----+
only showing top 20 rows

