## Taxi Queries on a Single Streaming DataFrame from CSV

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
from pyspark.sql.types import *

In [3]:
# --- Storage locations -------------------------------------------------------
# Single-file location; not referenced by the other cells visible in this notebook.
taxi_file_loc = "abfs:///taxi_data/yellow_tripdata_2019-12.csv"
# Directory the streaming reader watches for CSV files.
taxi_file_dir = "/taxi_data/"
# Root directory used by the streaming writers.
taxi_output_dir = "/taxi_data_year_count"

# --- Column names, in the same order as the fields of the raw schema ---------
schema_cols = [
    "VendorID",
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "passenger_count",
    "trip_distance",
    "RatecodeID",
    "store_and_fwd_flag",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
]

# --- Streaming bookkeeping ---------------------------------------------------
# Passed to the reader as its "badRecordsPath" option.
bad_recordset_path = "/taxi_error"
# Passed to the Parquet writer as its "checkpointLocation" option.
checkpoint_loc = "/checkpoints"

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Define the schema ahead of time (streaming file sources need an explicit schema by default, and it avoids schema-inference cost)

In [4]:
# Schema of the raw yellow-taxi CSV: one field per column, in file order.
# Every field is nullable (the StructField default), exactly as with
# StructType.add(name, type).
taxi_orig_schema = StructType([
    # identifiers and timestamps
    StructField("VendorID", IntegerType()),
    StructField("tpep_pickup_datetime", TimestampType()),
    StructField("tpep_dropoff_datetime", TimestampType()),
    # trip attributes
    StructField("passenger_count", IntegerType()),
    StructField("trip_distance", DoubleType()),
    StructField("RatecodeID", IntegerType()),
    StructField("store_and_fwd_flag", StringType()),
    StructField("PULocationID", IntegerType()),
    StructField("DOLocationID", IntegerType()),
    StructField("payment_type", IntegerType()),
    # monetary amounts
    StructField("fare_amount", DoubleType()),
    StructField("extra", DoubleType()),
    StructField("mta_tax", DoubleType()),
    StructField("tip_amount", DoubleType()),
    StructField("tolls_amount", DoubleType()),
    StructField("improvement_surcharge", DoubleType()),
    StructField("total_amount", DoubleType()),
    StructField("congestion_surcharge", DoubleType()),
])

# Reduced schema: the raw schema without the VendorID, RatecodeID,
# store_and_fwd_flag, PULocationID and DOLocationID columns.
# Not referenced by the other cells visible in this notebook.
taxi_clean_schema = (
    StructType()
    .add("tpep_pickup_datetime", TimestampType())
    .add("tpep_dropoff_datetime", TimestampType())
    .add("passenger_count", IntegerType())
    .add("trip_distance", DoubleType())
    .add("payment_type", IntegerType())
    .add("fare_amount", DoubleType())
    .add("extra", DoubleType())
    .add("mta_tax", DoubleType())
    .add("tip_amount", DoubleType())
    .add("tolls_amount", DoubleType())
    .add("improvement_surcharge", DoubleType())
    .add("total_amount", DoubleType())
    .add("congestion_surcharge", DoubleType())
)

# Backward-compatible alias for the original, misspelled name.
taxi_clean_chema = taxi_clean_schema

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Define Input DF

In [5]:
# Streaming source: watch `taxi_file_dir` for CSV files and parse them with
# the predeclared raw schema, picking up at most one new file per trigger.
#
# NOTE(review): header is "false", so if the files start with a column-name
#   row, that row is parsed as data - confirm the input files are headerless.
# NOTE(review): "badRecordsPath" appears to be a Databricks-runtime option -
#   confirm that this cluster honours it.
taxi_input_df = (
    spark.readStream
    .schema(taxi_orig_schema)
    .options(
        maxFilesPerTrigger=1,
        badRecordsPath=bad_recordset_path,  # destination for records that fail to parse
        header="false",
    )
    .csv(taxi_file_dir)  # the source directory
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Persist the raw stream as Parquet under `taxi_output_dir`, tracking progress
# in `checkpoint_loc`.
#
# The former `.option("startingOffsets", "earliest")` was removed: it is a
# Kafka *source* option and has no effect on a file-sink writer, so it only
# suggested replay behaviour that does not exist here. The commented-out
# trigger line was dropped as dead code; the default trigger is still used.
query = (
    taxi_input_df
    .writeStream
    .format("parquet")
    .queryName("taxi")
    .outputMode("append")
    .option("path", taxi_output_dir)
    .option("checkpointLocation", checkpoint_loc)
    .start()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Stop the "taxi" Parquet streaming query.
query.stop()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Define Output DF (query)

### Read it

## Run Queries

In [6]:
# Expose the streaming input DataFrame to Spark SQL under the name `nyyellowtaxi`.
# Queries against this view are therefore streaming queries as well.
taxi_input_df.createOrReplaceTempView('nyyellowtaxi')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
import uuid

def write_batch_df(batch_df, epoch_id, output_dir=None):
    """Write one micro-batch to its own CSV directory (best effort).

    Has the ``(batch_df, epoch_id)`` shape expected by ``foreachBatch``.
    The batch is written to ``<output_dir>/taxi_<epoch_id>``.

    Args:
        batch_df: DataFrame holding the micro-batch; only ``batch_df.write.csv``
            is used.
        epoch_id: Identifier of the micro-batch, embedded in the target path.
        output_dir: Optional root directory. When ``None`` (the default) the
            module-level ``taxi_output_dir`` is used, as before.

    Returns:
        None.

    A failed write is logged and skipped instead of being silently discarded,
    so the streaming query keeps running but the failure is visible.
    ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.
    """
    import logging  # local import keeps this cell self-contained

    try:
        root = taxi_output_dir if output_dir is None else output_dir
        # mode='ignore': if the target already exists the write is a no-op.
        batch_df.write.csv('{}/taxi_{}'.format(root, epoch_id), mode='ignore')
    except Exception:
        logging.getLogger("write_batch_df").exception(
            "Failed to write micro-batch %s; skipping it", epoch_id
        )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Streaming aggregation: trip count per (passenger_count, pickup year).
# In "complete" output mode every trigger hands the full, updated result table
# to `write_batch_df`. A trigger fires every 20 seconds.
#
# The former `.option("startingOffsets", "earliest")` was removed: it is a
# Kafka *source* option and is ignored by this writer.
q0 = (
    spark.sql("SELECT passenger_count, \
    year(tpep_pickup_datetime) year, count(*) total FROM nyyellowtaxi GROUP BY passenger_count, year(tpep_pickup_datetime)")
    .writeStream
    .queryName("year_count")
    .outputMode("complete")
    .trigger(processingTime='20 seconds')
    .foreachBatch(write_batch_df)
    .start()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Current status of the "year_count" streaming query.
q0.status

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{u'message': u'Processing new data', u'isTriggerActive': True, u'isDataAvailable': True}

In [6]:
# Stream only the trips whose fare exceeds 11.0 into an in-memory table named
# "query4", so that it can be inspected with SQL.
amount = (
    taxi_input_df
    .where(F.col("fare_amount") > 11.0)
    .writeStream
    .queryName("query4")
    .outputMode("append")
    .format("memory")
    .start()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Current status of the "query4" in-memory streaming query.
amount.status

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

{u'message': u'Processing new data', u'isTriggerActive': True, u'isDataAvailable': True}

In [None]:
%%sql
-- Rows collected so far by the "query4" in-memory sink.
select * from query4

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [70]:
# Average total_amount per passenger_count, defined over the `nyyellowtaxi` view.
# This only builds the DataFrame; nothing is executed here.
avg_fare_psg_count = spark.sql("SELECT passenger_count, \
                                avg(total_amount) avg_fare FROM nyyellowtaxi GROUP BY passenger_count")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [71]:
# Show the DataFrame's repr (column names and types only).
avg_fare_psg_count

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[passenger_count: int, avg_fare: double]

In [None]:
# Trip counts grouped by passenger_count, pickup year and rounded trip
# distance, ordered by year and then by descending trip count.
# NOTE(review): `nyyellowtaxi` is a view over a streaming DataFrame, so this is
#   a streaming query; sorting a streaming aggregation presumably requires
#   "complete" output mode - confirm against the Structured Streaming guide.
psg_year_count = spark.sql("SELECT passenger_count, \
                    year(tpep_pickup_datetime) trip_year, \
                    round(trip_distance) distance, \
                    count(*) trips \
                    FROM nyyellowtaxi \
                    GROUP BY passenger_count, \
                        year(tpep_pickup_datetime), round(trip_distance) ORDER BY trip_year, trips desc")

In [None]:
# `psg_year_count` is defined over the streaming view `nyyellowtaxi`, so calling
# .show() on it directly raises AnalysisException ("Queries with streaming
# sources must be executed with writeStream.start()"). Route the aggregation
# through an in-memory sink and display that table instead.
psg_year_count_query = (
    psg_year_count
    .writeStream
    .format("memory")
    .queryName("psg_year_count")
    .outputMode("complete")  # the query aggregates and sorts, which needs complete mode
    .start()
)
# Block until the data currently available in the source has been processed,
# so the table is populated before it is displayed. This can block for a long
# time if new files keep arriving.
psg_year_count_query.processAllAvailable()
spark.table("psg_year_count").show()