# Section 1: Exploration Operations

In [None]:
%%scala

// List the contents of the mounted Data Lake Gen2 container.
// Uses the absolute "/mnt/..." form, matching the path used to read the CSV
// elsewhere in this notebook.
display(dbutils.fs.ls("/mnt/DatalakeGen2Storage/"))

In [None]:
%%scala

// Preview the beginning of the raw trip file before loading it as a DataFrame.
// Uses the absolute "/mnt/..." form, matching the path used to read the CSV
// elsewhere in this notebook.
dbutils.fs.head("/mnt/DatalakeGen2Storage/YellowTaxiTripData.csv")

In [None]:
%%scala

// Load the raw trip CSV into a DataFrame, using the first row as column names.
// Declared as `var` because later cells reassign it after each cleaning and
// transformation step.
// NOTE(review): no schema is supplied and inferSchema is not enabled, so the
// columns are presumably all read as strings -- confirm this is intended.
var yellowTaxiTripDataDF = spark
    .read
    .option("header", "true")
    .csv("/mnt/DatalakeGen2Storage/YellowTaxiTripData.csv")

display(yellowTaxiTripDataDF)

# Section 2: Analyze Data

In [None]:
%%scala

// Show summary statistics for the passenger_count and trip_distance columns.
display(
    yellowTaxiTripDataDF.describe(
        "passenger_count",
        "trip_distance"
    )
)

# Section 3: Clean Data

In [None]:
%%scala

// Row count before filtering, for comparison with the count printed afterwards.
println(
    "Before Filter: " + yellowTaxiTripDataDF.count()
)

// Keep only rows with a positive passenger count and a positive trip distance.
// `where` and `filter` are aliases; the first takes a SQL expression string,
// the second a Column expression.
yellowTaxiTripDataDF = yellowTaxiTripDataDF
    .where("passenger_count > 0")
    .filter($"trip_distance" > 0.0)

// Row count after filtering.
println(
    "After Filter: " + yellowTaxiTripDataDF.count()
)

In [None]:
%%scala

// Row count before dropping nulls, for comparison with the count printed afterwards.
println(
    "Before Filter: " + yellowTaxiTripDataDF.count()
)

// Drop rows where PULocationID or DOLocationID is null.
yellowTaxiTripDataDF = yellowTaxiTripDataDF
    .na.drop(
        Seq("PULocationID", "DOLocationID")
    )

// Row count after dropping nulls.
println(
    "After Filter: " + yellowTaxiTripDataDF.count()
)

# Section 4: Transform Data

In [None]:
%%scala

// Rename the pickup/drop-off location columns to descriptive names.
// The source names use the same casing as the null-drop cell ("PULocationID",
// "DOLocationID"): withColumnRenamed is a silent no-op when the column is not
// found, which is what a case mismatch causes if spark.sql.caseSensitive is on.
yellowTaxiTripDataDF = yellowTaxiTripDataDF
    .withColumnRenamed("PULocationID", "PickupLocationId")
    .withColumnRenamed("DOLocationID", "DropLocationId")

// Print the schema to verify the renamed columns.
yellowTaxiTripDataDF.printSchema()

# Section 5: Load Data

In [None]:
%%scala

// Imported explicitly so this cell does not rely on the notebook's ambient imports.
import org.apache.spark.sql.SaveMode

// Write the processed DataFrame to the data lake as CSV with a header row,
// replacing any existing output at the target path.
// NOTE(review): Spark writes a directory of part files at this path rather
// than a single "YellowTaxiData.csv" file -- confirm consumers expect that.
// NOTE(review): "dateFormat" only applies to date-typed columns; the DataFrame
// was read without schema inference, so this option likely has no effect
// here -- confirm.
yellowTaxiTripDataDF
    .write
    .option("header", "true")
    .option("dateFormat", "yyyy-MM-dd HH:mm:ss.S")
    .mode(SaveMode.Overwrite)
    .csv("/mnt/DatalakeGen2Storage/ProcessedTaxiData/YellowTaxiData.csv")

In [None]:
%%scala

// Imported explicitly so this cell does not rely on the notebook's ambient imports.
import org.apache.spark.sql.SaveMode

// Write the processed DataFrame to the data lake as Parquet, replacing any
// existing output at the target path.
// The CSV-only options ("header", "dateFormat") were removed: Parquet stores
// the schema with the data, so they looked like dead configuration here.
yellowTaxiTripDataDF
    .write
    .mode(SaveMode.Overwrite)
    .parquet("/mnt/DatalakeGen2Storage/ProcessedTaxiData/YellowTaxiData.parquet")