In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [3]:
# Create the Spark session for this notebook, or reuse one that is already
# running. "local[*]" runs Spark in local mode using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/03/01 12:14:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Directory that receives the parquet output (relative to the working dir).
output_path = 'data/pq/fhv/2019/10/'

# Gzipped CSV that is read as the raw input (relative to the working dir).
input_path = 'data/raw/fhv/2019/10/fhv_tripdata_2019-10.csv.gz'

In [5]:
def get_fhv_schema():
    """Build the explicit Spark schema applied when reading the FHV CSV.

    Returns:
        StructType: seven fields, in the order listed below, each declared
        nullable.
    """
    # (column name, Spark data type) pairs, in schema order.
    columns = [
        ('dispatching_base_num', StringType()),
        ('pickup_datetime', TimestampType()),
        ('dropoff_datetime', TimestampType()),
        ('PULocationID', IntegerType()),
        ('DOLocationID', IntegerType()),
        ('SR_Flag', StringType()),
        ('Affiliated_base_number', StringType()),
    ]
    return StructType([
        StructField(col_name, col_type, True)
        for col_name, col_type in columns
    ])

In [6]:
# Load the CSV at `input_path` into a Spark DataFrame. The first line is
# treated as a header, and the explicit schema from get_fhv_schema() is
# applied instead of letting Spark infer column types.
df = (
    spark.read
    .option('header', 'true')
    .schema(get_fhv_schema())
    .csv(input_path)
)

# Print a preview of the loaded rows.
df.show()

24/03/01 12:19:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: dispatching_base_num, pickup_datetime, dropOff_datetime, PUlocationID, DOlocationID, SR_Flag, Affiliated_base_number
 Schema: dispatching_base_num, pickup_datetime, dropoff_datetime, PULocationID, DOLocationID, SR_Flag, Dispatching_base_number
Expected: Dispatching_base_number but found: Affiliated_base_number
CSV file: file:///home/mtulow/tmp/data/raw/fhv/2019/10/fhv_tripdata_2019-10.csv.gz
+--------------------+-------------------+-------------------+------------+------------+-------+-----------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Dispatching_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+-----------------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   null|                 B00009|
|              B00013|2019-1

                                                                                

In [None]:
# Repartition the DataFrame into 6 partitions, then write it out as parquet.
# `repartition` is a DataFrame method, not a DataFrameWriter method, so it has
# to be called before `.write`; `df.write.repartition(...)` raises
# AttributeError.
# NOTE: the writer's default save mode raises an error if `output_path`
# already exists, so re-running this cell requires removing the directory
# first (or choosing an explicit `.mode(...)`).
df.repartition(6) \
    .write \
    .parquet(output_path)


