In [1]:
from pyspark.context import SparkContext

In [2]:
sc = SparkContext(master='local',
    appName='RDD-Test',
    sparkHome=None,
    pyFiles=None,
    environment=None,
    batchSize=0,
    conf=None,
    gateway=None,
    jsc=None)

22/11/25 15:16:22 WARN Utils: Your hostname, Pauls-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.4.90 instead (on interface en0)
22/11/25 15:16:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/25 15:16:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/11/25 15:16:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# In Python
# Create an RDD of tuples (name, age)
dataRDD = sc.parallelize([("Brooke", 20), ("Denny", 31), ("Jules", 30), 
  ("TD", 35), ("Brooke", 25)])
# Use map and reduceByKey transformations with their lambda 
# expressions to aggregate and then compute average

agesRDD = (dataRDD
  .map(lambda x: (x[0], (x[1], 1)))
  .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
  .map(lambda x: (x[0], x[1][0]/x[1][1])))

In [4]:
agesRDD.collect()

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

[('Brooke', 22.5), ('Denny', 31.0), ('Jules', 30.0), ('TD', 35.0)]

In [5]:
# In Python 
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
# Create a DataFrame using SparkSession
spark = (SparkSession
  .builder
  .appName("AuthorsAges")
  .getOrCreate())
# Create a DataFrame 
data_df = spark.createDataFrame([("Brooke", 20), ("Denny", 31), ("Jules", 30), 
  ("TD", 35), ("Brooke", 25)], schema=["name", "age"])
# Group the same names together, aggregate their ages, and compute an average
avg_df = data_df.groupBy("name").agg(avg("age").alias("avg_age"))
# Show the results of the final execution
avg_df.show()

+------+-------+
|  name|avg_age|
+------+-------+
|Brooke|   22.5|
| Jules|   30.0|
|    TD|   35.0|
| Denny|   31.0|
+------+-------+



### Scala example, does same thing as above (in Python)

```scala
import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.SparkSession
// Create a DataFrame using SparkSession
val spark = SparkSession
  .builder
  .appName("AuthorsAges")
  .getOrCreate()
// Create a DataFrame of names and ages
val dataDF = spark.createDataFrame(Seq(("Brooke", 20), ("Brooke", 25), 
  ("Denny", 31), ("Jules", 30), ("TD", 35))).toDF("name", "age")
// Group the same names together, aggregate their ages, and compute an average
val avgDF = dataDF.groupBy("name").agg(avg("age"))
// Show the results of the final execution
avgDF.show()
```

In [6]:
# Define schema for our data using DDL 
schema = """`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"""

In [7]:
spark = SparkSession.builder.appName("Example-3_7").getOrCreate()


22/11/25 15:16:26 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [8]:
ls data -alh

ls: -alh: No such file or directory
data:
blogs.json         sf-fire-calls.csv


In [9]:
blogs_df = spark.read.schema(schema).json("data/blogs.json")

In [10]:
blogs_df.show(5)

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
+---+---------+-------+-----------------+---------+-----+--------------------+
only showing top 5 rows



In [11]:
blogs_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [12]:
type(blogs_df)

pyspark.sql.dataframe.DataFrame

In [13]:
from pyspark.sql.functions import mean, round

In [14]:
spark.read.json("data/blogs.json").show()

+--------------------+---------+-----+---+-------+---------+-----------------+
|           Campaigns|    First| Hits| Id|   Last|Published|              Url|
+--------------------+---------+-----+---+-------+---------+-----------------+
| [twitter, LinkedIn]|    Jules| 4535|  1|  Damji| 1/4/2016|https://tinyurl.1|
| [twitter, LinkedIn]|   Brooke| 8908|  2|  Wenig| 5/5/2018|https://tinyurl.2|
|[web, twitter, FB...|    Denny| 7659|  3|    Lee| 6/7/2019|https://tinyurl.3|
|       [twitter, FB]|Tathagata|10568|  4|    Das|5/12/2018|https://tinyurl.4|
|[web, twitter, FB...|    Matei|40578|  5|Zaharia|5/14/2014|https://tinyurl.5|
| [twitter, LinkedIn]|  Reynold|25568|  6|    Xin| 3/2/2015|https://tinyurl.6|
+--------------------+---------+-----+---+-------+---------+-----------------+



In [15]:
spark.read.json("data/blogs.json").printSchema()

root
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- First: string (nullable = true)
 |-- Hits: long (nullable = true)
 |-- Id: long (nullable = true)
 |-- Last: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Url: string (nullable = true)



In [16]:
blogs_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



Pretty close (almost the same) for infering the data types just reading the JSON file

In [17]:
blogs_df.agg(round(mean('Hits'),3)).show()

+-------------------+
|round(avg(Hits), 3)|
+-------------------+
|          16302.667|
+-------------------+



In [18]:
from pyspark.sql.functions import expr, column

In [19]:
blogs_df.select("Hits", expr("Hits * 2")).show()

+-----+----------+
| Hits|(Hits * 2)|
+-----+----------+
| 4535|      9070|
| 8908|     17816|
| 7659|     15318|
|10568|     21136|
|40578|     81156|
|25568|     51136|
+-----+----------+



In [20]:
blogs_df.select(column("Hits"), column("Hits") * 2).show()

+-----+----------+
| Hits|(Hits * 2)|
+-----+----------+
| 4535|      9070|
| 8908|     17816|
| 7659|     15318|
|10568|     21136|
|40578|     81156|
|25568|     51136|
+-----+----------+



In [21]:
# This adds a new column, Big Hitters, based on the conditional expression
blogs_df.withColumn("Big Hitters", (expr("Hits > 10000"))).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [22]:
blogs_df.sort("Hits", ascending=False).show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



# Rows

In [23]:
# In Python
from pyspark.sql import Row


blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015", ["twitter", "LinkedIn"])
# access using index for individual items
blog_row[1]

'Reynold'

In [24]:
blog_row[3]

'https://tinyurl.6'

In [25]:
# In Python, define a schema 
from pyspark.sql.types import *

# Programmatic way to define a schema 
fire_schema = StructType([StructField('CallNumber', IntegerType(), True),
                StructField('UnitID', StringType(), True),
                StructField('IncidentNumber', IntegerType(), True),
                StructField('CallType', StringType(), True),                  
                StructField('CallDate', StringType(), True),      
                StructField('WatchDate', StringType(), True),
                StructField('CallFinalDisposition', StringType(), True),
                StructField('AvailableDtTm', StringType(), True),
                StructField('Address', StringType(), True),       
                StructField('City', StringType(), True),       
                StructField('Zipcode', IntegerType(), True),       
                StructField('Battalion', StringType(), True),                 
                StructField('StationArea', StringType(), True),       
                StructField('Box', StringType(), True),       
                StructField('OriginalPriority', StringType(), True),       
                StructField('Priority', StringType(), True),       
                StructField('FinalPriority', IntegerType(), True),       
                StructField('ALSUnit', BooleanType(), True),       
                StructField('CallTypeGroup', StringType(), True),
                StructField('NumAlarms', IntegerType(), True),
                StructField('UnitType', StringType(), True),
                StructField('UnitSequenceInCallDispatch', IntegerType(), True),
                StructField('FirePreventionDistrict', StringType(), True),
                StructField('SupervisorDistrict', StringType(), True),
                StructField('Neighborhood', StringType(), True),
                StructField('Location', StringType(), True),
                StructField('RowID', StringType(), True),
                StructField('Delay', FloatType(), True)])

# Use the DataFrameReader interface to read a CSV file
sf_fire_file = "data/sf-fire-calls.csv"
fire_df = spark.read.csv(sf_fire_file, header=True, sep= ",",schema=fire_schema)

In [26]:
fire_df.show(2, vertical=True, truncate=False)

22/11/25 15:16:27 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
-RECORD 0-----------------------------------------------------------
 CallNumber                 | 20110016                              
 UnitID                     | T13                                   
 IncidentNumber             | 2003235                               
 CallType                   | Structure Fire                        
 CallDate                   | 01/11/2002                            
 WatchDate                  | 01/10/2002                            
 CallFinalDisposition       | Other                                 
 AvailableDtTm              | 01/11/2002 01:51:44 AM                
 Address                    | 2000 Block of CALIFORNIA ST           
 City                       | SF                                    
 Zipcode                    | 94109                          

In [27]:
fire_df.select("IncidentNumber", "CallType", "Battalion", "Neighborhood", "Delay").show()

+--------------+----------------+---------+--------------------+---------+
|IncidentNumber|        CallType|Battalion|        Neighborhood|    Delay|
+--------------+----------------+---------+--------------------+---------+
|       2003235|  Structure Fire|      B04|     Pacific Heights|     2.95|
|       2003241|Medical Incident|      B10|Bayview Hunters P...|      4.7|
|       2003242|Medical Incident|      B03|          Tenderloin|2.4333334|
|       2003250|    Vehicle Fire|      B06|      Bernal Heights|      1.5|
|       2003259|          Alarms|      B04|    Western Addition|3.4833333|
|       2003279|  Structure Fire|      B03|Financial Distric...|     1.75|
|       2003301|          Alarms|      B09|Oceanview/Merced/...|2.7166667|
|       2003304|          Alarms|      B02|          Tenderloin|1.7833333|
|       2003343|Medical Incident|      B04|           Japantown|1.5166667|
|       2003348|Medical Incident|      B05| Castro/Upper Market|2.7666667|
|       2003381|Medical I

In [28]:
# In Python
few_fire_df = (fire_df
  .select("IncidentNumber", "AvailableDtTm", "CallType") 
  .where(column("CallType") != "Medical Incident"))
few_fire_df.show(5, truncate=False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [29]:
fire_df.columns

['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallDate',
 'WatchDate',
 'CallFinalDisposition',
 'AvailableDtTm',
 'Address',
 'City',
 'Zipcode',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumAlarms',
 'UnitType',
 'UnitSequenceInCallDispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'Neighborhood',
 'Location',
 'RowID',
 'Delay']

In [30]:
(fire_df.select('CallType', 'CallDate', 'WatchDate', 'CallFinalDisposition', 'Neighborhood')
 .where(column('CallType') == "Structure Fire").show())

+--------------+----------+----------+--------------------+--------------------+
|      CallType|  CallDate| WatchDate|CallFinalDisposition|        Neighborhood|
+--------------+----------+----------+--------------------+--------------------+
|Structure Fire|01/11/2002|01/10/2002|               Other|     Pacific Heights|
|Structure Fire|01/11/2002|01/11/2002|               Other|Financial Distric...|
|Structure Fire|01/11/2002|01/11/2002|               Other|           Excelsior|
|Structure Fire|01/11/2002|01/11/2002|               Other|             Mission|
|Structure Fire|01/11/2002|01/11/2002|               Other|             Mission|
|Structure Fire|01/11/2002|01/11/2002|               Other|             Mission|
|Structure Fire|01/11/2002|01/11/2002|               Other|          Tenderloin|
|Structure Fire|01/12/2002|01/11/2002|               Other|     South of Market|
|Structure Fire|01/12/2002|01/12/2002|               Other|          Tenderloin|
|Structure Fire|01/12/2002|0

In [31]:
fire_df.groupby('CallType').count().sort('Count', ascending=False).show()

+--------------------+------+
|            CallType| count|
+--------------------+------+
|    Medical Incident|113794|
|      Structure Fire| 23319|
|              Alarms| 19406|
|   Traffic Collision|  7013|
|Citizen Assist / ...|  2524|
|               Other|  2166|
|        Outside Fire|  2094|
|        Vehicle Fire|   854|
|Gas Leak (Natural...|   764|
|        Water Rescue|   755|
|Odor (Strange / U...|   490|
|   Electrical Hazard|   482|
|Elevator / Escala...|   453|
|Smoke Investigati...|   391|
|          Fuel Spill|   193|
|              HazMat|   124|
|Industrial Accidents|    94|
|           Explosion|    89|
|Train / Rail Inci...|    57|
|  Aircraft Emergency|    36|
+--------------------+------+
only showing top 20 rows



## Renaming, adding, and dropping columns

In [32]:
# In Python
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")
(new_fire_df
  .select("ResponseDelayedinMins")
  .where(column("ResponseDelayedinMins") > 5)
  .show(5, False))

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



In [33]:
fire_df.columns

['CallNumber',
 'UnitID',
 'IncidentNumber',
 'CallType',
 'CallDate',
 'WatchDate',
 'CallFinalDisposition',
 'AvailableDtTm',
 'Address',
 'City',
 'Zipcode',
 'Battalion',
 'StationArea',
 'Box',
 'OriginalPriority',
 'Priority',
 'FinalPriority',
 'ALSUnit',
 'CallTypeGroup',
 'NumAlarms',
 'UnitType',
 'UnitSequenceInCallDispatch',
 'FirePreventionDistrict',
 'SupervisorDistrict',
 'Neighborhood',
 'Location',
 'RowID',
 'Delay']

In [34]:
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")
(new_fire_df
  .select("ResponseDelayedinMins")
  .where(column("ResponseDelayedinMins") > 5)
  .show(5, False))

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



In [35]:
from pyspark.sql.functions import to_timestamp, to_date, year

In [36]:
# In Python
fire_ts_df = (new_fire_df
  .withColumn("IncidentDate", to_timestamp(column("CallDate"), "MM/dd/yyyy")).drop("CallDate") 
  .withColumn("OnWatchDate", to_date(column("WatchDate"), "MM/dd/yyyy")).drop("WatchDate") 
  .withColumn("AvailableDtTS", to_timestamp(column("AvailableDtTm"), "MM/dd/yyyy hh:mm:ss a")).drop("AvailableDtTm"))

# Select the converted columns
(fire_ts_df
  .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
  .show(5, False))

+-------------------+-----------+-------------------+
|IncidentDate       |OnWatchDate|AvailableDtTS      |
+-------------------+-----------+-------------------+
|2002-01-11 00:00:00|2002-01-10 |2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 |2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 |2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 |2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 |2002-01-11 06:01:58|
+-------------------+-----------+-------------------+
only showing top 5 rows



In [37]:
# In Python
(fire_ts_df
  .select(year('IncidentDate'))
  .distinct()
  .orderBy(year('IncidentDate'))
  .show())

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



In [38]:
fire_ts_df.groupby(year('IncidentDate')).count().sort('year(IncidentDate)', ascending=False).show()

+------------------+-----+
|year(IncidentDate)|count|
+------------------+-----+
|              2018|10136|
|              2017|12135|
|              2016|11609|
|              2015|11458|
|              2014|10775|
|              2013|10020|
|              2012| 9674|
|              2011| 9735|
|              2010| 9341|
|              2009| 8789|
|              2008| 8869|
|              2007| 8255|
|              2006| 8174|
|              2005| 8282|
|              2004| 8283|
|              2003| 8499|
|              2002| 8090|
|              2001| 7713|
|              2000| 5459|
+------------------+-----+



#### Let’s take our first question: what were the most common types of fire calls?

In [39]:
# In Python
(fire_ts_df
  .select("CallType")
  .where(column("CallType").isNotNull())
  .groupBy("CallType")
  .count()
  .orderBy("count", ascending=False)
  .show(n=10, truncate=False))

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



In [40]:
fire_ts_df.show(1, vertical=True)

-RECORD 0------------------------------------------
 CallNumber                 | 20110016             
 UnitID                     | T13                  
 IncidentNumber             | 2003235              
 CallType                   | Structure Fire       
 CallFinalDisposition       | Other                
 Address                    | 2000 Block of CAL... 
 City                       | SF                   
 Zipcode                    | 94109                
 Battalion                  | B04                  
 StationArea                | 38                   
 Box                        | 3362                 
 OriginalPriority           | 3                    
 Priority                   | 3                    
 FinalPriority              | 3                    
 ALSUnit                    | false                
 CallTypeGroup              | null                 
 NumAlarms                  | 1                    
 UnitType                   | TRUCK                
 UnitSequenc

In [41]:
subset_col = [
 'CallType',
 'CallFinalDisposition',
 'City',
 'Zipcode',
 'Battalion',
 'NumAlarms',
 'UnitType',
 'Neighborhood',
 'Location',
 'ResponseDelayedinMins',
 'IncidentDate']

In [42]:
fire_ts_df.select(subset_col).show(5, vertical=True)

-RECORD 0-------------------------------------
 CallType              | Structure Fire       
 CallFinalDisposition  | Other                
 City                  | SF                   
 Zipcode               | 94109                
 Battalion             | B04                  
 NumAlarms             | 1                    
 UnitType              | TRUCK                
 Neighborhood          | Pacific Heights      
 Location              | (37.7895840679362... 
 ResponseDelayedinMins | 2.95                 
 IncidentDate          | 2002-01-11 00:00:00  
-RECORD 1-------------------------------------
 CallType              | Medical Incident     
 CallFinalDisposition  | Other                
 City                  | SF                   
 Zipcode               | 94124                
 Battalion             | B10                  
 NumAlarms             | 1                    
 UnitType              | MEDIC                
 Neighborhood          | Bayview Hunters P... 
 Location    

### Other Common DataFrame Operations

In [43]:
import pyspark.sql.functions as F

fire_ts_df.select(F.sum('NumAlarms'), 
                  F.avg('ResponseDelayedinMins'), 
                  F.min('ResponseDelayedinMins'),
                  F.max('ResponseDelayedinMins')).show()

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



In [44]:
F.st

AttributeError: module 'pyspark.sql.functions' has no attribute 'st'