In [1]:
import pyspark
from pyspark.sql import SparkSession

A `SparkSession` is the **class** of the object that we instantiate

- This class is the entry point into all functionality in Spark
- A `SparkSession` can be used to create DataFrames, register DataFrames as tables, execute SQL over tables, cache tables, and read Parquet files

In [2]:
# Build (or reuse) the SparkSession: the object through which we talk to Spark.
# The parenthesised chain avoids fragile backslash line continuations.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

- `builder` is a class attribute (not a method) that gives access to a `Builder` object used to construct `SparkSession` instances
- `master()` sets the Spark master URL to connect to
    - `local[*]` means Spark will run on a local cluster (local machine)
        - `[*]` means Spark will run with as many CPU cores as possible
            - i.e., This tells Spark to use all available cores (e.g., if we wanted to use only 2 cores, we would write `local[2]`)
- `appName()` defines the name of our application/session, which will show in the Spark UI at http://localhost:4040/
- `getOrCreate()` will create the session or recover the object if it was previously created
    - i.e., It returns an existing `SparkSession`, if available, or creates a new one

In [3]:
# Load the taxi zone lookup CSV, treating the first line as column names.
df = (
    spark.read
    .option('header', 'true')
    .csv('./data/taxi+_zone_lookup.csv')
)

Note that similarly to pandas, Spark can read CSV files into **DataFrames**, a tabular data structure
- But *unlike* pandas, Spark can handle *much bigger* datasets
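    - However, **by default it does not infer the datatypes of each column**: every column is read as a string unless the `inferSchema` option is enabled (or an explicit schema is supplied)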

In [4]:
# print the zone lookup DataFrame as a text table (long values are truncated with '...')
df.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [6]:
# test that writing works to a 'zones' directory
# mode='overwrite' keeps this cell re-runnable: with the default save mode the
# write raises an error if data/zones already exists from a previous run.
# NOTE: overwrite replaces whatever is currently stored in data/zones.
df.write.parquet('data/zones', mode='overwrite')

In [10]:
# Load the January 2019 FHV trip records; the header row supplies column names
# and inferSchema asks Spark to work out each column's type from the data.
# (A commented-out alternative input was left here: 'data/fhvhv_tripdata_2021-01.csv.gz')
df_fhv = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', True)
    .csv('data/fhv_tripdata_2019-01.csv')
)

In [11]:
# see the FHV data: print only the first 5 rows as a text table
df_fhv.show(5)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00001|2019-01-01 00:30:00|2019-01-01 02:51:55|        null|        null|   null|                B00001|
|              B00001|2019-01-01 00:45:00|2019-01-01 00:54:49|        null|        null|   null|                B00001|
|              B00001|2019-01-01 00:15:00|2019-01-01 00:54:52|        null|        null|   null|                B00001|
|              B00008|2019-01-01 00:19:00|2019-01-01 00:39:00|        null|        null|   null|                B00008|
|              B00008|2019-01-01 00:27:00|2019-01-01 00:37:00|        null|        null|   null|                B00008|
+--------------------+------------------

In [12]:
# test that writing works to a 'fhv' directory
# mode='overwrite' keeps this cell re-runnable: with the default save mode the
# write raises an error if data/fhv already exists from a previous run.
# NOTE: overwrite replaces whatever is currently stored in data/fhv.
df_fhv.write.parquet('data/fhv', mode='overwrite')

In [16]:
# list the files Spark wrote for the zones dataset
# NOTE: `dir` and the backslash path are Windows-specific; this cell will not work as-is on Linux/macOS
!dir data\zones\

 Volume in drive C has no label.
 Volume Serial Number is 08A3-CF2D

 Directory of C:\Users\nimz\Documents\de_zoomcamp\week5_batch_processing\data\zones

02/23/2024  07:17 PM    <DIR>          .
02/23/2024  07:17 PM    <DIR>          ..
02/23/2024  07:17 PM                56 .part-00000-88f611ef-406d-4479-b478-72294a039c82-c000.snappy.parquet.crc
02/23/2024  07:17 PM                 8 ._SUCCESS.crc
02/23/2024  07:17 PM             5,916 part-00000-88f611ef-406d-4479-b478-72294a039c82-c000.snappy.parquet
02/23/2024  07:17 PM                 0 _SUCCESS
               4 File(s)          5,980 bytes
               2 Dir(s)  281,278,066,688 bytes free


In [17]:
# list the files Spark wrote for the FHV dataset (several part-* files, one per written partition)
# NOTE: `dir` and the backslash path are Windows-specific; this cell will not work as-is on Linux/macOS
!dir data\fhv\

 Volume in drive C has no label.
 Volume Serial Number is 08A3-CF2D

 Directory of C:\Users\nimz\Documents\de_zoomcamp\week5_batch_processing\data\fhv

02/23/2024  07:23 PM    <DIR>          .
02/23/2024  07:23 PM    <DIR>          ..
02/23/2024  07:23 PM           176,648 .part-00000-3fe1b419-cc5e-4d57-bdf3-8336fbe2d5b7-c000.snappy.parquet.crc
02/23/2024  07:23 PM           181,088 .part-00001-3fe1b419-cc5e-4d57-bdf3-8336fbe2d5b7-c000.snappy.parquet.crc
02/23/2024  07:23 PM           182,536 .part-00002-3fe1b419-cc5e-4d57-bdf3-8336fbe2d5b7-c000.snappy.parquet.crc
02/23/2024  07:23 PM           179,624 .part-00003-3fe1b419-cc5e-4d57-bdf3-8336fbe2d5b7-c000.snappy.parquet.crc
02/23/2024  07:23 PM           167,104 .part-00004-3fe1b419-cc5e-4d57-bdf3-8336fbe2d5b7-c000.snappy.parquet.crc
02/23/2024  07:23 PM           179,588 .part-00005-3fe1b419-cc5e-4d57-bdf3-8336fbe2d5b7-c000.snappy.parquet.crc
02/23/2024  07:23 PM           176,236 .part-00006-3fe1b419-cc5e-4d57-bdf3-8336fbe2d5b7-c000.

In [18]:
# first 5 rows as a list of Row objects; unlike show(), values appear as Python objects
# (e.g. datetime.datetime, None), which hints at the column types
df_fhv.head(5)

[Row(dispatching_base_num='B00001', pickup_datetime=datetime.datetime(2019, 1, 1, 0, 30), dropOff_datetime=datetime.datetime(2019, 1, 1, 2, 51, 55), PUlocationID=None, DOlocationID=None, SR_Flag=None, Affiliated_base_number='B00001'),
 Row(dispatching_base_num='B00001', pickup_datetime=datetime.datetime(2019, 1, 1, 0, 45), dropOff_datetime=datetime.datetime(2019, 1, 1, 0, 54, 49), PUlocationID=None, DOlocationID=None, SR_Flag=None, Affiliated_base_number='B00001'),
 Row(dispatching_base_num='B00001', pickup_datetime=datetime.datetime(2019, 1, 1, 0, 15), dropOff_datetime=datetime.datetime(2019, 1, 1, 0, 54, 52), PUlocationID=None, DOlocationID=None, SR_Flag=None, Affiliated_base_number='B00001'),
 Row(dispatching_base_num='B00008', pickup_datetime=datetime.datetime(2019, 1, 1, 0, 19), dropOff_datetime=datetime.datetime(2019, 1, 1, 0, 39), PUlocationID=None, DOlocationID=None, SR_Flag=None, Affiliated_base_number='B00008'),
 Row(dispatching_base_num='B00008', pickup_datetime=datetime.dat

**Note that Spark may be unable to infer the data types of some columns; for example, our `datetime` columns here might be read as strings**

In [19]:
# inspect the column types Spark assigned to the FHV data (read with inferSchema enabled)
df_fhv.schema

StructType([StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', TimestampType(), True), StructField('dropOff_datetime', TimestampType(), True), StructField('PUlocationID', IntegerType(), True), StructField('DOlocationID', IntegerType(), True), StructField('SR_Flag', IntegerType(), True), StructField('Affiliated_base_number', StringType(), True)])

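But it worked this time: because we set the `inferSchema` option when reading the file, the datetime columns were read as `TimestampType` and the location ID columns as `IntegerType`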

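Check the zones data too (it was read *without* `inferSchema`, so expect every column, including `LocationID`, to be a string)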

In [21]:
# first 5 zone rows as Row objects; note that LocationID comes back as a quoted string
df.head(5)

[Row(LocationID='1', Borough='EWR', Zone='Newark Airport', service_zone='EWR'),
 Row(LocationID='2', Borough='Queens', Zone='Jamaica Bay', service_zone='Boro Zone'),
 Row(LocationID='3', Borough='Bronx', Zone='Allerton/Pelham Gardens', service_zone='Boro Zone'),
 Row(LocationID='4', Borough='Manhattan', Zone='Alphabet City', service_zone='Yellow Zone'),
 Row(LocationID='5', Borough='Staten Island', Zone='Arden Heights', service_zone='Boro Zone')]

In [22]:
# schema of the zones DataFrame, which was read without the inferSchema option
df.schema

StructType([StructField('LocationID', StringType(), True), StructField('Borough', StringType(), True), StructField('Zone', StringType(), True), StructField('service_zone', StringType(), True)])