# New York Taxi Data Analysis 
Group-9

## Setup

In [None]:
# generic modules
import itertools
import os
import re
import timeit
import gc

# specific module
#import wget

# common ds modules
import pandas as pd
#import plotly.express as px

# spark modules for session management
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

# spark functions
from pyspark.sql.functions import lit
import pyspark.sql.functions as sparkle

# spark types
from pyspark.sql.types import *

# spark ml
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Start (or reuse) the Spark session for this notebook: application name
# "nyctaxi", local master, 10G of driver memory.
spark = (
    SparkSession.builder
    .appName('nyctaxi')
    .master('local[*]')
    .config('spark.driver.memory', '10G')
    .getOrCreate()
)


#     .config("spark.sql.default.parallelism", "360") \ 
'''
.config("spark.driver.maxResultSize", "8g") \
    
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", "150000") \
    .config("spark.sql.tungsten.enabled", "true") \
    .config("spark.sql.shuffle.partitions", "360") \
    .config("spark.rdd.compress", "true") \
'''

'\n.config("spark.driver.maxResultSize", "8g")     \n    .config("spark.sql.execution.arrow.pyspark.enabled", "true")     .config("spark.sql.execution.arrow.maxRecordsPerBatch", "150000")     .config("spark.sql.tungsten.enabled", "true")     .config("spark.sql.shuffle.partitions", "360")     .config("spark.rdd.compress", "true") '

## Download data from the website into the Docker container
https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [None]:
#spark.read.csv("Dataset/yellow-2019-01.csv")

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string]

## Converting CSV to Initial Parquet

In [None]:
# Convert every CSV in ./Dataset/ into a Parquet dataset with the same base
# name under ./Dataset/prq/.
# `files` holds the base names (file name without the ".csv" suffix).
files = [name[: -len(".csv")] for name in os.listdir("./Dataset/") if name.endswith(".csv")]
for file in files:
    inpath = f"./Dataset/{file}.csv"
    # header="true": the first line of each CSV supplies the column names
    readdf = spark.read.csv(inpath, header="true")
    outpath = f"./Dataset/prq/{file}.parquet"
    # overwrite so that re-running the cell replaces the earlier output
    # instead of failing because the path already exists
    readdf.write.mode("overwrite").parquet(outpath)

In [None]:
# taxi colours to process; each entry is used as a file-name prefix when the
# monthly parquet files are combined
colours=['yellow']

## Combining Dirty Data with color of taxi

In [None]:
# combines month data into single file per colour
for colour in colours:
    # get files for loop; sorted so the result does not depend on the
    # arbitrary order in which os.listdir returns directory entries
    files = sorted(
        name[: -len(".parquet")]
        for name in os.listdir("./Dataset/prq")
        if name.endswith(".parquet") and name.startswith(colour)
    )
    if not files:
        raise FileNotFoundError(f"no parquet data for colour {colour!r} in ./Dataset/prq")
    # uses the first dataset to create an empty df with the correct format to
    # join to (taken from the listing rather than a hard-coded month)
    initpath = f"./Dataset/prq/{files[0]}.parquet"
    outdf = spark.read.parquet(initpath)
    outdf = outdf.limit(0)
    for file in files:
        inpath = f"./Dataset/prq/{file}.parquet"
        readdf = spark.read.parquet(inpath)
        # !! unionByName !! matches columns by name; plain union matches by
        # position and can result in incorrect mapping
        outdf = outdf.unionByName(readdf)
    outpath = f"./data/{colour}-all.parquet"
    # overwrite so that re-running the cell replaces the earlier output
    outdf.write.mode("overwrite").parquet(outpath)

In [None]:
# read the combined yellow parquet data back in and display its column names
yellowdf = spark.read.parquet("./data/yellow-all.parquet")
yellowdf.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge']

In [None]:
# pass data to next stage: `data` is the name the following cells work on
data = yellowdf

## Uncleaned Full Dataset

In [None]:
# check record numbers match: total number of rows in the dataset
data.count()

7667792

In [None]:
# Transform the yellow dataframe in one chained expression:
#   - add the missing columns trip_type ("1") and ehail_fee ("0") as constants
#   - add a `colour` column ("yellow") to track which dataset a row came from
#   - rename the tpep_* timestamp columns to pickup_datetime / dropoff_datetime
yellowdf = (
    yellowdf
    .withColumn('trip_type', lit("1"))
    .withColumn('ehail_fee', lit("0"))
    .withColumn('colour', lit("yellow"))
    .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
    .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime")
)

In [None]:
# pass data to next stage: rebind `data` to the frame with the added and
# renamed columns
data = yellowdf

In [None]:
# initial schema; column types have not been set yet as no cleaning has been done
data.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- trip_type: string (nullable = false)
 |-- ehail_fee: string (nullable = false)
 |-- colour: string (nullable = false)



In [None]:
# create view for spark.sql queries; the SQL cells refer to it as data_init_view
data.createOrReplaceTempView("data_init_view")

## Grouping by Colour

In [None]:
# SQL query: group by the relevant variable (colour) and count rows to check the split
spark.sql("""
            SELECT colour, count(colour)
            FROM data_init_view
            GROUP by colour
        """).show()

+------+-------------+
|colour|count(colour)|
+------+-------------+
|yellow|      7667792|
+------+-------------+



## VendorID
Should be 1 or 2
   - 1-Creative Mobile Technologies
   - 2-Verifone INC.
    
- VendorId=4 contains 230,613 records?
- RatecodeID includes 99 for VendorId=4, which is invalid: it should be in the range 1-6

In [None]:
# count rows per VendorID to see which vendor codes are present
spark.sql("""
            SELECT VendorID, count(VendorID)
            FROM data_init_view
            GROUP by VendorID
        """).show()

+--------+---------------+
|VendorID|count(VendorID)|
+--------+---------------+
|       1|        2938778|
|       4|          76823|
|       2|        4652191|
+--------+---------------+



In [None]:
# display rows recorded with VendorID 4
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE VendorID == 4
        """).show()

+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|       4|2019-01-25 17:00:59|2019-01-25 17:04:53|              1|          .58|         1|                 N|         237|         262|           2|  

In [None]:
# count the VendorID 4 rows per colour
spark.sql("""
            SELECT colour, count(colour)
            FROM data_init_view
            WHERE VendorID == 4
            GROUP by colour
        """).show()

+------+-------------+
|colour|count(colour)|
+------+-------------+
|yellow|        76823|
+------+-------------+



In [None]:
# RatecodeID distribution within the VendorID 4 rows
spark.sql("""
            SELECT RatecodeID, count(RatecodeID)
            FROM data_init_view
            WHERE VendorID == 4
            GROUP by RatecodeID
        """).show()

+----------+-----------------+
|RatecodeID|count(RatecodeID)|
+----------+-----------------+
|         3|               78|
|        99|                5|
|         5|              225|
|         1|            75240|
|         4|               46|
|         2|             1229|
+----------+-----------------+



In [None]:
# passenger_count distribution within the VendorID 4 rows
spark.sql("""
            SELECT passenger_count, count(passenger_count)
            FROM data_init_view
            WHERE VendorID == 4
            GROUP by passenger_count
        """).show()

+---------------+----------------------+
|passenger_count|count(passenger_count)|
+---------------+----------------------+
|              3|                   266|
|              5|                     9|
|              1|                 75449|
|              4|                   107|
|              2|                   992|
+---------------+----------------------+



In [None]:
# label every VendorID 4 row with a month group: 'jan' when the pickup
# timestamp contains "2019-01", otherwise "unknown"; stored as view vid4months
spark.sql("""
            CREATE OR REPLACE TEMPORARY VIEW vid4months
            AS
            SELECT pickup_datetime,
                CASE
                    WHEN pickup_datetime LIKE '%2019-01%' THEN 'jan'
                    ELSE "unknown"
                END AS MonthGroup
            FROM data_init_view
            WHERE VendorID == 4
        """)

# count rows per month group, smallest group first
spark.sql("""
            SELECT MonthGroup, count(MonthGroup) as count
            FROM vid4months
            GROUP BY MonthGroup
            ORDER BY count
        """).show()


+----------+-----+
|MonthGroup|count|
+----------+-----+
|       jan|76823|
+----------+-----+



## Passenger Count
- 657,274 records of passenger count 0
- For 7-9 passenger count
    - ~350 records (what are they — a maxi-taxi type?)

In [None]:
# count rows per passenger_count value
spark.sql("""
            SELECT passenger_count, count(passenger_count)
            FROM data_init_view
            GROUP by passenger_count
        """).show()

+---------------+----------------------+
|passenger_count|count(passenger_count)|
+---------------+----------------------+
|              7|                    19|
|              3|                314721|
|              8|                    29|
|              0|                117381|
|              5|                323842|
|              6|                200811|
|              9|                     9|
|              1|               5456121|
|              4|                140753|
|              2|               1114106|
+---------------+----------------------+



## RatecodeID should be in range(1-6)
- 1271 with RatecodeID=99
- 693 with distance 0
- 394 PULocation 264

In [None]:
# count rows per RatecodeID
spark.sql("""
            SELECT RatecodeID, count(RatecodeID)
            FROM data_init_view
            GROUP by RatecodeID
        """).show()

+----------+-----------------+
|RatecodeID|count(RatecodeID)|
+----------+-----------------+
|         3|            11801|
|        99|              252|
|         5|            54569|
|         6|               46|
|         1|          7430139|
|         4|             4895|
|         2|           166090|
+----------+-----------------+



In [None]:
# count, per colour, the rows whose RatecodeID is outside 1-6
spark.sql("""
            SELECT colour, count(colour) as count
            FROM data_init_view
            WHERE NOT RatecodeID BETWEEN 1 AND 6
            GROUP BY colour
        """).show()

+------+-----+
|colour|count|
+------+-----+
|yellow|  252|
+------+-----+



In [None]:
# trip_distance values among rows with RatecodeID outside 1-6, most frequent first
spark.sql("""
            SELECT trip_distance, count(trip_distance) AS count
            FROM data_init_view
            WHERE NOT RatecodeID BETWEEN 1 AND 6
            GROUP by trip_distance
            ORDER BY count DESC
        """).show()

+-------------+-----+
|trip_distance|count|
+-------------+-----+
|          .00|  145|
|          .84|    4|
|         1.27|    4|
|         1.35|    3|
|          .66|    2|
|          .72|    2|
|          .74|    2|
|         1.50|    2|
|          .94|    2|
|          .73|    2|
|         3.84|    2|
|         2.57|    2|
|         2.19|    2|
|         1.18|    2|
|         1.52|    2|
|         2.78|    1|
|         1.30|    1|
|        23.89|    1|
|         1.68|    1|
|         1.74|    1|
+-------------+-----+
only showing top 20 rows



In [None]:
# pickup locations among rows with RatecodeID outside 1-6, most frequent first
spark.sql("""
            SELECT PULocationID, count(PULocationID) AS count
            FROM data_init_view
            WHERE NOT RatecodeID BETWEEN 1 AND 6
            GROUP by PULocationID
            ORDER BY count DESC
        """).show()

+------------+-----+
|PULocationID|count|
+------------+-----+
|         264|   84|
|         265|   16|
|         142|    8|
|         239|    8|
|         170|    7|
|          43|    7|
|          79|    7|
|         231|    6|
|         162|    6|
|         230|    6|
|         138|    5|
|         193|    5|
|         132|    5|
|         107|    5|
|         161|    5|
|         234|    4|
|         141|    4|
|         237|    4|
|         145|    4|
|         238|    3|
+------------+-----+
only showing top 20 rows



## Payment Type should be in range(1-6)
- All valid within range
- No 6 
- Count descends as the payment type increases

In [None]:
# count rows per payment_type, most frequent first
spark.sql("""
            SELECT payment_type, count(payment_type) AS count
            FROM data_init_view
            GROUP by payment_type
            ORDER BY count DESC
        """).show()

+------------+-------+
|payment_type|  count|
+------------+-------+
|           1|5486027|
|           2|2137415|
|           3|  33186|
|           4|  11164|
+------------+-------+



## Extra, it should be 0.5 or 1
- If all the values is valid then we can change to two bools[]
- Out of range value including negatives, Overnight Charges?
- 101 unique values 
- 19 negative values
- Valid values
    - 4598696 records contains 1
    - 7902055 records contains 0.2

In [None]:
# count rows per `extra` value, most frequent first
spark.sql("""
            SELECT extra, count(extra) AS count
            FROM data_init_view
            GROUP by extra
            ORDER BY count DESC
        """).show()

+-----+-------+
|extra|  count|
+-----+-------+
|    0|4199855|
|  0.5|2116494|
|    1|1316580|
|  4.5|  31241|
| -0.5|   2201|
|   -1|    863|
|  0.8|    229|
| -4.5|     79|
|  1.3|     74|
| 17.5|     63|
|  1.8|     34|
|  2.5|     21|
|  0.3|     10|
|   18|      9|
|    3|      7|
| 18.5|      6|
|  5.3|      4|
|  0.2|      3|
| 0.25|      1|
| 10.9|      1|
+-----+-------+
only showing top 20 rows



In [None]:
# number of `extra` groups: the grouped result has one row per value, so
# count() on it gives the number of different values
spark.sql("""
            SELECT extra, count(extra) AS count
            FROM data_init_view
            GROUP by extra
            ORDER BY count DESC
        """).count()

37

In [None]:
# number of different negative `extra` values
spark.sql("""
            SELECT extra, count(extra) AS count
            FROM data_init_view
            WHERE extra < 0
            GROUP by extra
            ORDER BY count DESC
        """).count()

7

## Mta_Tax should be 0.5
- If all values are valid change to bool[]
- 37262299 valid values
- 203257: 0 values
- 52388: -0.5 value(Refund?)
    - Check other out of range and negative values

In [None]:
# count rows per mta_tax value, most frequent first
spark.sql("""
            SELECT mta_tax, count(mta_tax) AS count
            FROM data_init_view
            GROUP by mta_tax
            ORDER BY count DESC
        """).show()

+-------+-------+
|mta_tax|  count|
+-------+-------+
|    0.5|7625883|
|      0|  34984|
|   -0.5|   6819|
|   0.25|     97|
|   0.35|      2|
|  32.53|      1|
|  37.51|      1|
|    0.9|      1|
|   2.42|      1|
|   60.8|      1|
|      1|      1|
|   18.3|      1|
+-------+-------+



### Out of range value
- 52556: Out of range value

In [None]:
# count, per colour, rows whose mta_tax is neither "0" nor "0.5"
# (the column and both literals are strings, so this is a text comparison)
spark.sql("""
            SELECT colour, count(colour) AS count
            FROM data_init_view
            WHERE mta_tax != "0"
            AND mta_tax != "0.5"
            GROUP by colour
            ORDER BY count DESC
        """).show()

+------+-----+
|colour|count|
+------+-----+
|yellow| 6925|
+------+-----+



## Improvement_Surcharge should be 0.3
- If all values are valid change to bool[]
- 37447079 valid 0.3 values
- 53940: -0.3 value(Refund?)
- 17076: 0 value
- 16: 1 value
    - All 0 trip_distance
    - All PU/Do Id= 265

In [None]:
# count rows per improvement_surcharge value, most frequent first
spark.sql("""
            SELECT improvement_surcharge, count(improvement_surcharge) AS count
            FROM data_init_view
            GROUP by improvement_surcharge
            ORDER BY count DESC
        """).show()

+---------------------+-------+
|improvement_surcharge|  count|
+---------------------+-------+
|                  0.3|7658005|
|                 -0.3|   7129|
|                    0|   2657|
|                  0.6|      1|
+---------------------+-------+



In [None]:
# display rows whose improvement_surcharge equals "1"
# NOTE(review): the value counts for this dataset list no "1" (only 0.3, -0.3,
# 0 and 0.6), so this appears to return no rows - confirm the intended value
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE improvement_surcharge = "1"
        """).show()

+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|pickup_datetime|dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----

## Trip Type should be 1 for Yellow Taxi
- All are valid

In [None]:
# count rows per trip_type value, most frequent first
spark.sql("""
            SELECT trip_type, count(trip_type) AS count
            FROM data_init_view
            GROUP by trip_type
            ORDER BY count DESC
        """).show()

+---------+-------+
|trip_type|  count|
+---------+-------+
|        1|7667792|
+---------+-------+



## Check Location Value
(Should be an integer from 1-265)- from Taxizone lookup Table:-
    https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
- PULocationID
- DOLocationID
- Both
    - No non integer values 
    - No null

In [None]:
# look for invalid pickup locations: anything that is not an integer in 1-265.
# A row is reported when PULocationID is NULL or not numeric (the cast yields
# NULL), lies outside 1-265, or has a fractional part.
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE CAST(PULocationID AS DOUBLE) IS NULL
            OR NOT PULocationID BETWEEN 1 AND 265
            OR mod(PULocationID, 1) != 0
        """).show()

+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|pickup_datetime|dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----

In [None]:
# count of pickup location values, most frequent first
spark.sql("""
            SELECT PULocationID, count(PULocationID) AS count
            FROM data_init_view
            GROUP by PULocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|PULocationID| count|
+------------+------+
|         237|332473|
|         236|323008|
|         161|312392|
|         162|277166|
|         230|263646|
|         186|260712|
|          48|240903|
|         170|238978|
|         234|237648|
|         142|235144|
|         239|207883|
|         163|199682|
|         132|196612|
|          79|193955|
|         141|192380|
|         138|184334|
|         107|176786|
|         164|172647|
|          68|171971|
|         238|162192|
+------------+------+
only showing top 20 rows



In [None]:
# display rows whose DOLocationID is outside 1-265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE NOT DOLocationID BETWEEN 1 AND 265
        """).show()

+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|pickup_datetime|dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----

In [None]:
# count of drop-off location values, most frequent first
spark.sql("""
            SELECT DOLocationID, count(DOLocationID) AS count
            FROM data_init_view
            GROUP by DOLocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         236|334323|
|         237|296185|
|         161|293782|
|         170|242037|
|         162|232451|
|         230|225336|
|         142|214164|
|          48|208624|
|         234|204386|
|         239|204350|
|         141|202184|
|         186|189486|
|         163|175754|
|         238|175310|
|          79|168608|
|          68|167144|
|         107|162697|
|         263|158297|
|         164|154200|
|         140|152042|
+------------+------+
only showing top 20 rows



In [None]:
# counts for drop-off location IDs of 264 and above
spark.sql("""
            SELECT DOLocationID, count(DOLocationID) AS count
            FROM data_init_view
            WHERE DOLocationID >= 264
            GROUP by DOLocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         264|149094|
|         265| 16817|
+------------+------+



## Check dates are within range:-
(Should be 2019-01-01 00:00:00 to 2019-05-31 23:59:59)
- Pickup_datetime
    - min 2001-01-01 00:09:39
        -
    - Max 2088-01-24 00:25:39
        -
- Dropoff_datetime
    - min 2001-01-01 06:39:54
        -
    - Max 2088-01-24 07:28:25
        -

In [None]:
# smallest and largest pickup and drop-off timestamps
# (both columns are still strings at this point, so MIN/MAX compare text)
spark.sql("""
            SELECT MIN(pickup_datetime), MAX(pickup_datetime), MIN(dropoff_datetime),  MAX(dropoff_datetime)
            FROM data_init_view
        """).show()

+--------------------+--------------------+---------------------+---------------------+
|min(pickup_datetime)|max(pickup_datetime)|min(dropoff_datetime)|max(dropoff_datetime)|
+--------------------+--------------------+---------------------+---------------------+
| 2001-02-02 14:55:07| 2088-01-24 00:25:39|  2001-02-02 15:07:27|  2088-01-24 07:28:25|
+--------------------+--------------------+---------------------+---------------------+



In [None]:
# calculate tripdays using datediff (simpler): days from pickup date to
# drop-off date, then count trips per tripdays value, most frequent first
spark.sql("""
            WITH tripdaysTable AS (
            SELECT *, datediff(dropoff_datetime, pickup_datetime) as tripdays
            FROM data_init_view
            )
            SELECT tripdays, count(tripdays) AS count
            FROM tripdaysTable
            GROUP by tripdays
            ORDER BY count DESC
        """).show()

+--------+-------+
|tripdays|  count|
+--------+-------+
|       0|7597276|
|       1|  70508|
|     -58|      1|
|      -2|      1|
|     -19|      1|
|      30|      1|
|      22|      1|
|      24|      1|
|       5|      1|
|       2|      1|
+--------+-------+



In [None]:
# total number of trips whose tripdays is neither 0 nor 1, obtained by
# summing the per-value counts of the tripdays grouping
spark.sql("""
            WITH countsTable AS (
                WITH tripdaysTable AS (
                    SELECT *, datediff(dropoff_datetime, pickup_datetime) as tripdays
                    FROM data_init_view
                    )
                SELECT tripdays, count(tripdays) AS count
                FROM tripdaysTable
                GROUP by tripdays
                ORDER BY count DESC
            )
            SELECT sum(count)
            FROM countsTable
            WHERE tripdays != "0"
            AND tripdays != "1"
        """).show()

+----------+
|sum(count)|
+----------+
|         8|
+----------+



In [None]:
# inspect out of range values: rows picked up before 2019-01-01, latest first
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE pickup_datetime < "2019-01-01 00:00:00"
            ORDER BY pickup_datetime DESC
        """).show()

+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|       2|2018-12-31 23:59:58|2019-01-01 00:03:52|              1|          .66|         1|                 N|         162|         170|           2|  

In [None]:
# count high out of range values: pickups after 2019-12-31 23:59:59
# NOTE(review): the cleaning query limits pickups to 2019-01-31 23:59:59 -
# confirm whether this bound should match it
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE pickup_datetime > "2019-12-31 23:59:59" 
        """).count()

2

In [None]:
# count low out of range values: pickups before 2019-01-01 00:00:00
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE pickup_datetime < "2019-01-01 00:00:00"
        """).count()

441

In [None]:
# count drop-offs before 2019-01-01 00:00:00
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE dropoff_datetime < "2019-01-01 00:00:00"
        """).count()

316

In [None]:
# count drop-offs after 2019-01-31 23:59:59 (the ORDER BY does not affect the count)
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE dropoff_datetime > "2019-01-31 23:59:59" 
            ORDER BY dropoff_datetime
        """).count()

3011

In [None]:
# investigate trips that span the end of the month: picked up before
# 2019-01-31 23:59:59 but dropped off after it
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE dropoff_datetime > "2019-01-31 23:59:59"
            AND pickup_datetime < "2019-01-31 23:59:59" 
            ORDER BY dropoff_datetime
        """).count()

2915

In [None]:
# show a candidate value to evaluate: trips picked up before
# 2019-12-31 23:59:59 and dropped off after 2020-01-01 23:59:59
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE dropoff_datetime > "2020-01-01 23:59:59"
            AND pickup_datetime < "2019-12-31 23:59:59" 
            ORDER BY dropoff_datetime
        """).show()

+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|pickup_datetime|dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----

In [None]:
# count where pickup location was unknown (ID 264 or 265)
spark.sql("""
            SELECT PULocationID, count(PULocationID) AS count
            FROM data_init_view
            WHERE PULocationID = "264"
            OR PULocationID = "265"
            GROUP by PULocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|PULocationID| count|
+------------+------+
|         264|159760|
|         265|  3871|
+------------+------+



In [None]:
# and for dropoff: count where drop-off location was 264 or 265
spark.sql("""
            SELECT DOLocationID, count(DOLocationID) AS count
            FROM data_init_view
            WHERE DOLocationID = "264"
            OR DOLocationID = "265"
            GROUP by DOLocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         264|149094|
|         265| 16817|
+------------+------+



In [None]:
# count where both the pickup and the drop-off location were 264 (unknown)
spark.sql("""
            SELECT DOLocationID, count(DOLocationID) AS count
            FROM data_init_view
            WHERE PULocationID = "264"
            AND DOLocationID = "264"
            GROUP by DOLocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         264|138614|
+------------+------+



In [None]:
# where pickup was the other unknown ID (265) but dropoff was neither 264 nor 265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE PULocationID = "265"
            AND DOLocationID != "264"
            AND DOLocationID != "265"
        """).count()

811

In [None]:
# where dropoff was unknown (265) but pickup was neither 264 nor 265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE DOLocationID = "265"
            AND PULocationID != "264"
            AND PULocationID != "265"
        """).count()

13316

In [None]:
# where dropoff was 264 but pickup was neither 264 nor 265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE DOLocationID = "264"
            AND PULocationID != "264"
            AND PULocationID != "265"
        """).count()

10454

In [None]:
# where both pickup and dropoff were 265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE DOLocationID = "265"
            AND PULocationID = "265"
        """).count()

3034

In [None]:
# print the column names and types of `data` again before cleaning
data.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- trip_type: string (nullable = false)
 |-- ehail_fee: string (nullable = false)
 |-- colour: string (nullable = false)



In [None]:
# remove the exploration view from the session catalog
spark.catalog.dropTempView("data_init_view")

## Remove duplicate rows

In [None]:
# keep only distinct rows of `data`
dataunique = data.distinct()

In [None]:
# Alias the de-duplicated frame and count its rows.
dataU = dataunique
dataU.count()

7667792

## Initial Cleaning
Duplicates?
    - Run Distinct
### Outcomes from Initial EDA
1. Drop ehail_fee
2. Drop ratecode=99
3. Convert store_and_fwd_flag to bool
4. Drop bad dates
    - Low pickup_datetime<2019-01-01 00:00:00
    - High pickup_datetime>2019-05-31 23:59:59
    - Low dropoff_datetime<2019-01-01 00:00:00
    - High dropoff_datetime>2019-05-31 23:59:59
### Datatypes 
  - Initially all Strings 

In [None]:
# Expose the de-duplicated frame to Spark SQL as data_u_view.
dataU.createOrReplaceTempView("data_u_view")

In [None]:
# Build the `clean` view: drop RatecodeID 99, rows with a NULL trip_type,
# pickups in zone 265, and rows whose timestamps fall outside the windows below.
# Timestamps are still strings here, so BETWEEN compares them as text.
# NOTE(review): the pickup window ends 2019-01-31 but the drop-off window ends
# 2019-01-02, so only trips finishing in the first two days survive. The
# cleaning notes in this notebook mention 2019-05-31 as the upper bound.
# Confirm whether the narrow drop-off window is an intentional sample.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW clean AS (
                SELECT *
                FROM data_u_view
                WHERE RatecodeID != "99"
                AND trip_type IS NOT NULL
                AND PULocationID != "265"
                AND pickup_datetime BETWEEN "2019-01-01 00:00:00" AND "2019-01-31 23:59:59"
                AND dropoff_datetime BETWEEN "2019-01-01 00:00:00" AND "2019-01-02 23:59:59"
            )
        """)

DataFrame[]

In [None]:
# Build the `clean2` view from `clean`.
# - Keeps the listed trip columns (ehail_fee and congestion_surcharge are not
#   selected, so they are dropped here).
# - Recodes three string columns as "1"/"0" flags that are cast to BOOLEAN
#   in a later cell:
#     store_and_fwd_flag : "1" when the raw value is "Y"
#     dispatched         : "1" when trip_type is "2"
#     colour             : "1" when the row is a yellow-cab record
# The previous version returned "0" from both branches of the `dispatched` and
# `colour` CASE expressions, so both flags were constant.
# NOTE(review): trip_type "2" is taken to mean "dispatch" (TLC green-taxi data
# dictionary: 1 = street-hail, 2 = dispatch) -- confirm against the value used
# to fill trip_type for yellow rows.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW clean2 AS (
            SELECT VendorID,
                pickup_datetime,
                dropoff_datetime,
                passenger_count,
                trip_distance,
                RatecodeID,
                PULocationID,
                DOLocationID,
                payment_type,
                fare_amount,
                extra,
                mta_tax,
                tip_amount,
                tolls_amount,
                improvement_surcharge,
                total_amount,
            CASE WHEN store_and_fwd_flag = "Y" THEN "1"
            ELSE "0"
            END AS store_and_fwd_flag,
            CASE WHEN trip_type = "2" THEN "1"
            ELSE "0"
            END AS dispatched,
            CASE WHEN colour = "yellow" THEN "1"
            ELSE "0"
            END AS colour
            FROM clean
            )
        """)

DataFrame[]

In [None]:
# Materialise the clean2 view as a DataFrame handle.
dataC = spark.sql("SELECT * FROM clean2")

In [None]:
# List the columns that survived the first cleaning pass.
dataC.columns

['VendorID',
 'pickup_datetime',
 'dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'store_and_fwd_flag',
 'dispatched',
 'colour']

## Check all columns for NA/Null
- NA
- N/A
- NAN
- NULL
- NIL
- " "

In [None]:
# strings that commonly stand in for a missing value
commonNA = ["NA", "N/A", "NAN", "NIL", "NULL", " "]

# column names of the cleaned frame
columns = dataC.columns

# One SELECT statement per NA string. Every column is wrapped in NULLIF so a
# case-insensitive match on that string becomes NULL; the column keeps its name.
# Because the value is passed through UPPER, non-NULL values come back upper-cased.
anil = [
    "SELECT "
    + ", ".join(
        f"NULLIF(UPPER({col}), UPPER('{nval}')) AS {col}" for col in columns
    )
    + " FROM clean2"
    for nval in commonNA
]

# check output
anil

["SELECT NULLIF(UPPER(VendorID), UPPER('NA')) AS VendorID, NULLIF(UPPER(pickup_datetime), UPPER('NA')) AS pickup_datetime, NULLIF(UPPER(dropoff_datetime), UPPER('NA')) AS dropoff_datetime, NULLIF(UPPER(passenger_count), UPPER('NA')) AS passenger_count, NULLIF(UPPER(trip_distance), UPPER('NA')) AS trip_distance, NULLIF(UPPER(RatecodeID), UPPER('NA')) AS RatecodeID, NULLIF(UPPER(PULocationID), UPPER('NA')) AS PULocationID, NULLIF(UPPER(DOLocationID), UPPER('NA')) AS DOLocationID, NULLIF(UPPER(payment_type), UPPER('NA')) AS payment_type, NULLIF(UPPER(fare_amount), UPPER('NA')) AS fare_amount, NULLIF(UPPER(extra), UPPER('NA')) AS extra, NULLIF(UPPER(mta_tax), UPPER('NA')) AS mta_tax, NULLIF(UPPER(tip_amount), UPPER('NA')) AS tip_amount, NULLIF(UPPER(tolls_amount), UPPER('NA')) AS tolls_amount, NULLIF(UPPER(improvement_surcharge), UPPER('NA')) AS improvement_surcharge, NULLIF(UPPER(total_amount), UPPER('NA')) AS total_amount, NULLIF(UPPER(store_and_fwd_flag), UPPER('NA')) AS store_and_fwd_f

In [None]:
# Run the prepared NULLIF statements one after another; each pass replaces
# the clean2 view with the given SELECT wrapped in a CREATE OR REPLACE.
for select_stmt in anil:
    spark.sql(f"CREATE OR REPLACE TEMP VIEW clean2 AS ({select_stmt})")

In [None]:
# Re-read clean2 into a DataFrame after the NA strings were nulled.
dataC = spark.sql("SELECT * FROM clean2")

In [None]:
# Confirm the column list is unchanged by the NULLIF pass.
dataC.columns

['VendorID',
 'pickup_datetime',
 'dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'store_and_fwd_flag',
 'dispatched',
 'colour']

In [None]:
# Alias used by the following cells (no copy is made).
dataC3 = dataC

In [None]:
# Optional restart point: load the saved parquet instead of recomputing.
# dataC3 = spark.read.parquet("./data/clean2.parquet")
# Register the frame for SQL access as data_c3_view.
dataC3.createOrReplaceTempView("data_c3_view")

## Trip Length

In [None]:
# Add `tripdays`: the number of calendar days between the pickup date and the
# drop-off date, computed with Spark's datediff(end, start).
spark.sql("""
            CREATE OR REPLACE TEMP VIEW trip_view AS (
            SELECT *, datediff(dropoff_datetime, pickup_datetime) as tripdays
            FROM data_c3_view
            )
        """)

DataFrame[]

In [None]:
# Keep trips that end on the pickup day or the day after.
# Note: the filter is `<= 1`, so rows with a negative tripdays (drop-off date
# before pickup date) also pass.
tripClean = spark.sql("""
            SELECT *
            FROM trip_view
            WHERE tripdays <= 1
        """)

In [None]:
# Alias for the trip-length-filtered frame.
dataTC = tripClean

In [None]:
# Disabled: manual repartitioning that was tried against shuffle spill.
#dataTC = dataTC.repartition(180) # increase from 36 due to shuffle spill
# Register the frame for SQL access as data_TC_view.
dataTC.createOrReplaceTempView("data_TC_view")

In [None]:
# Count groups of potential refunds: rows sharing pickup/drop-off time, both
# zones and the absolute total. ABS() lets a negative (refunded) total fall in
# the same group as its positive counterpart. Only groups with more than one
# row are counted.
spark.sql("""
            SELECT pickup_datetime, dropoff_datetime, PULocationID, DOLocationID, ABS(total_amount) AS fare, count(*) AS count
            FROM data_TC_view
            GROUP BY pickup_datetime, dropoff_datetime, PULocationID, DOLocationID, fare
            HAVING count > 1
            ORDER BY count DESC
            """).count()

420

In [None]:
# Show example groups from the same potential-refund query, largest groups first.
spark.sql("""
            SELECT pickup_datetime, dropoff_datetime, PULocationID, DOLocationID, ABS(total_amount) AS fare, count(*) AS count
            FROM data_TC_view
            GROUP BY pickup_datetime, dropoff_datetime, PULocationID, DOLocationID, fare
            HAVING count > 1
            ORDER BY count DESC
            """).show()

+-------------------+-------------------+------------+------------+----+-----+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|fare|count|
+-------------------+-------------------+------------+------------+----+-----+
|2019-01-02 12:51:51|2019-01-02 12:52:05|          93|          93| 3.8|    2|
|2019-01-01 18:38:10|2019-01-01 18:39:35|          75|          75|52.8|    2|
|2019-01-02 13:21:02|2019-01-02 13:24:38|         236|         239| 5.3|    2|
|2019-01-02 15:49:13|2019-01-02 15:55:43|         237|         238| 7.8|    2|
|2019-01-01 00:33:02|2019-01-01 00:36:38|         166|         151| 6.3|    2|
|2019-01-01 16:18:12|2019-01-01 16:30:03|         144|         158|10.8|    2|
|2019-01-01 14:48:32|2019-01-01 14:52:57|         209|         261| 5.3|    2|
|2019-01-01 04:50:18|2019-01-01 04:50:25|         264|         235|80.3|    2|
|2019-01-01 18:25:16|2019-01-01 18:25:24|         142|          43|52.8|    2|
|2019-01-02 17:32:52|2019-01-02 17:38:31|         23

In [None]:
# Create a view of the trips distributed by pickup zone and sorted within each
# partition by drop-off zone and timestamps; intended to reduce shuffling in
# the join that follows.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW data_TC_redis AS
            SELECT *
            FROM data_TC_view
            DISTRIBUTE BY PULocationID
            SORT BY DOLocationID, pickup_datetime, dropoff_datetime
            """)

DataFrame[]

In [None]:
# Create the `refunds` view: one row per group of trips sharing zones,
# timestamps and absolute total (`fare`) that has more than one member,
# together with the group size (`count`). It uses the same distribution and
# sort keys as data_TC_redis.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW refunds AS
            SELECT pickup_datetime, dropoff_datetime, PULocationID, DOLocationID, ABS(total_amount) AS fare, count(*) AS count
            FROM data_TC_view
            GROUP BY PULocationID, DOLocationID, pickup_datetime, dropoff_datetime, fare
            HAVING count > 1
            DISTRIBUTE BY PULocationID
            SORT BY DOLocationID, pickup_datetime, dropoff_datetime
            """)

DataFrame[]

In [None]:
# Add `refunded_flag`: "1" when the trip belongs to a group of exactly two rows
# in the `refunds` view (same zones, timestamps and absolute total), else "0".
# The join also matches on the absolute total, which is the fifth grouping key
# of `refunds`. Without it, a trip sharing zones and timestamps with a refund
# pair but carrying a different fare would be flagged, and a trip would be
# duplicated if two fare groups shared the other four keys.
refunds = spark.sql("""
            SELECT data_tc_redis.*, CASE WHEN refunds.count = "2" THEN "1"
                                    ELSE "0"
                                    END AS refunded_flag
            FROM data_tc_redis
            LEFT JOIN refunds
            ON data_tc_redis.PULocationID = refunds.PULocationID
            AND data_tc_redis.DOLocationID = refunds.DOLocationID
            AND data_tc_redis.pickup_datetime = refunds.pickup_datetime
            AND data_tc_redis.dropoff_datetime = refunds.dropoff_datetime
            AND ABS(data_tc_redis.total_amount) = refunds.fare
            """)

In [None]:
# Alias for the refund-flagged frame.
dataR = refunds

In [None]:
# Optional restart point: load the saved parquet instead of recomputing.
# dataR = spark.read.parquet("./data/refundFlagged.parquet")
# Register the refund-flagged frame for SQL access as data_R_view.
dataR.createOrReplaceTempView("data_R_view")

In [None]:
# Count rows per refunded_flag value.
spark.sql("""
            SELECT refunded_flag, count(refunded_flag)
            FROM data_R_view
            GROUP by refunded_flag
        """).show()

+-------------+--------------------+
|refunded_flag|count(refunded_flag)|
+-------------+--------------------+
|            0|              384221|
|            1|                 840|
+-------------+--------------------+



In [None]:
# Inspect example rows that were flagged as part of a refund pair.
spark.sql("""
            SELECT *
            FROM data_R_view
            WHERE refunded_flag = 1
            """).show()

+--------+-------------------+-------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+------------------+----------+------+--------+-------------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|store_and_fwd_flag|dispatched|colour|tripdays|refunded_flag|
+--------+-------------------+-------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+------------------+----------+------+--------+-------------+
|       2|2019-01-02 19:40:58|2019-01-02 19:44:44|              1|          .51|         1|         237|         141|           2|        4.5|    1|    0.5|         0|     

In [None]:
# Count flagged rows whose total amount is exactly zero.
# total_amount is still a string at this stage, so it is cast before the test.
# The earlier string comparison against "0" was lexicographic and would not
# treat values such as "0.0" or "0.00" as zero.
spark.sql("""
            SELECT *
            FROM data_R_view
            WHERE refunded_flag = 1
            AND CAST(total_amount AS DOUBLE) = 0
            """).count()

0

In [None]:
# Find groups of rows that share zones, timestamps AND the signed total amount.
# Unlike the refund grouping, no ABS() is applied, so these are same-sign
# duplicates rather than charge/refund pairs.
equalfares = spark.sql("""
                        SELECT pickup_datetime, dropoff_datetime, PULocationID, DOLocationID, total_amount, count(*) AS count
                        FROM data_R_view
                        GROUP BY PULocationID, DOLocationID, pickup_datetime, dropoff_datetime, total_amount
                        HAVING count > 1
                        """)
# Caching is left disabled.
# equalfares.cache()
# Register the duplicate groups for the exclusion join below.
equalfares.createOrReplaceTempView("dups_view")

In [None]:
# Number of same-sign duplicate groups found.
equalfares.count()

0

In [None]:
# Exclusive left join (anti-join): keep only the rows of data_R_view that have
# no match in dups_view on zones and timestamps, which removes the remaining
# duplicates.
# Note: the join does not include total_amount, so every row sharing the four
# keys with a duplicate group is removed, not just the duplicated fare.
removedups = spark.sql("""
            SELECT data_R_view.*
            FROM data_R_view
            LEFT JOIN dups_view
            ON data_R_view.PULocationID = dups_view.PULocationID
            AND data_R_view.DOLocationID = dups_view.DOLocationID
            AND data_R_view.pickup_datetime = dups_view.pickup_datetime
            AND data_R_view.dropoff_datetime = dups_view.dropoff_datetime
            WHERE dups_view.PULocationID IS NULL
            AND dups_view.DOLocationID IS NULL
            AND dups_view.pickup_datetime IS NULL
            AND dups_view.dropoff_datetime IS NULL
            """)

In [None]:
# Alias for the frame with duplicate groups removed.
datarfd = removedups

In [None]:
# Register the de-duplicated frame for SQL access as rem_dups_view.
datarfd.createOrReplaceTempView("rem_dups_view")

In [None]:
# Verification: rerun the duplicate grouping on rem_dups_view; an empty result
# means no same-sign duplicate groups remain.
spark.sql("""
            SELECT pickup_datetime, dropoff_datetime, PULocationID, DOLocationID, total_amount, count(*) AS count
            FROM rem_dups_view
            GROUP BY PULocationID, DOLocationID, pickup_datetime, dropoff_datetime, total_amount
            HAVING count > 1
            """).show()

+---------------+----------------+------------+------------+------------+-----+
|pickup_datetime|dropoff_datetime|PULocationID|DOLocationID|total_amount|count|
+---------------+----------------+------------+------------+------------+-----+
+---------------+----------------+------------+------------+------------+-----+



In [None]:
# Drop the negative half of each refund pair: rows that are flagged as refunded
# and carry a negative total. total_amount is still a string here, so it is
# cast to a number first. A plain string comparison with "0" is lexicographic
# and would also treat values such as ".5" as "less than 0".
removeRefund = spark.sql("""
            SELECT *
            FROM rem_dups_view
            WHERE NOT (refunded_flag = "1" AND CAST(total_amount AS DOUBLE) < 0)
            """)

In [None]:
# Check impact: rows remaining after the negative refund rows were removed.
removeRefund.count()

384641

In [None]:
# Optional restart point: load the saved parquet instead of recomputing.
# datarfd = spark.read.parquet("./data/refunds-cleaned.parquet")
# Rebind datarfd to the refund-cleaned frame and register it as ref_clean_view.
datarfd = removeRefund
datarfd.createOrReplaceTempView("ref_clean_view")

In [None]:
# Frequency of each distinct `extra` value, most common first.
spark.sql("""
            SELECT extra, count(extra) AS count
            FROM ref_clean_view
            GROUP by extra
            ORDER BY count DESC
        """).show()

+-----+------+
|extra| count|
+-----+------+
|    0|204848|
|  0.5|126463|
|    1| 50785|
|  4.5|  2486|
| -0.5|    12|
|  2.5|    11|
|  0.8|    10|
| 17.5|     9|
|    3|     5|
|  1.3|     3|
|  1.8|     2|
| 18.5|     2|
|   -1|     1|
|   18|     1|
|  3.5|     1|
|  5.3|     1|
| -4.5|     1|
+-----+------+



In [None]:
# Frequency of each distinct `mta_tax` value, most common first.

spark.sql("""
            SELECT mta_tax, count(mta_tax) AS count
            FROM ref_clean_view
            GROUP by mta_tax
            ORDER BY count DESC
        """).show()

+-------+------+
|mta_tax| count|
+-------+------+
|    0.5|382200|
|      0|  2417|
|   -0.5|    24|
+-------+------+



In [None]:
# Frequency of each distinct `improvement_surcharge` value, most common first.
spark.sql("""
            SELECT improvement_surcharge, count(improvement_surcharge) AS count
            FROM ref_clean_view
            GROUP by improvement_surcharge
            ORDER BY count DESC
        """).show()

+---------------------+------+
|improvement_surcharge| count|
+---------------------+------+
|                  0.3|384455|
|                    0|   160|
|                 -0.3|    26|
+---------------------+------+



In [None]:
# Keep only rows whose mta_tax is one of the two expected values ("0.5" or "0").
# The column is still a string, so this is an exact text match.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW rem_one AS
            SELECT *
            FROM ref_clean_view
            WHERE mta_tax = "0.5"
            OR mta_tax = "0"
        """)

DataFrame[]

In [None]:
# Keep only rows whose improvement_surcharge is "0.3" or "0" (exact text match
# on the string column).
spark.sql("""
            CREATE OR REPLACE TEMP VIEW rem_two AS
            SELECT *
            FROM rem_one
            WHERE improvement_surcharge = "0.3"
            OR improvement_surcharge = "0"
        """)

DataFrame[]

In [None]:
# DataFrame handle on rem_two, used below to inspect columns and schema.
cols = spark.sql("""
            SELECT *
            FROM rem_two
        """)

In [None]:
# List the columns present after the tax/surcharge filters.
cols.columns

['VendorID',
 'pickup_datetime',
 'dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'store_and_fwd_flag',
 'dispatched',
 'colour',
 'tripdays',
 'refunded_flag']

In [None]:
# Show the column types before casting.
cols.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dispatched: string (nullable = true)
 |-- colour: string (nullable = true)
 |-- tripdays: integer (nullable = true)
 |-- refunded_flag: string (nullable = false)



## Convert mta_tax and improvement_surcharge to bools

In [None]:
# Build `clean_one`: same columns as rem_two, but mta_tax and
# improvement_surcharge are recoded to "1"/"0" indicators ("1" when the charge
# equals "0.5" / "0.3" respectively). They are moved to the end of the column
# list.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW clean_one AS
            SELECT VendorID,
                    pickup_datetime,
                    dropoff_datetime,
                    passenger_count,
                    trip_distance,
                    RatecodeID,
                    PULocationID,
                    DOLocationID,
                    payment_type,
                    fare_amount,
                    extra,
                    tip_amount,
                    tolls_amount,
                    total_amount,
                    store_and_fwd_flag,
                    dispatched,
                    colour,
                    tripdays,
                    refunded_flag,
                    CASE WHEN mta_tax = "0.5" THEN "1"
                    ELSE "0"
                    END AS mta_tax,
                    CASE WHEN improvement_surcharge = "0.3" THEN "1"
                    ELSE "0"
                    END AS improvement_surcharge
            FROM rem_two
        """)

DataFrame[]

In [None]:
# With the values cleaned, cast every string column to a compact type:
# small integers for IDs and counts, TIMESTAMP for the datetimes, FLOAT for
# amounts and distance, BOOLEAN for the "1"/"0" flags. `colour` is renamed to
# `yellow`; tripdays is already an integer and passes through unchanged.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW clean_two AS
            SELECT TINYINT(VendorID),
                        TIMESTAMP(pickup_datetime),
                        TIMESTAMP(dropoff_datetime),
                        TINYINT(passenger_count),
                        FLOAT(trip_distance),
                        TINYINT(RatecodeID),
                        SMALLINT(PULocationID),
                        SMALLINT(DOLocationID),
                        TINYINT(payment_type),
                        FLOAT(fare_amount),
                        FLOAT(extra),
                        FLOAT(tip_amount),
                        FLOAT(tolls_amount),
                        FLOAT(total_amount),
                        BOOLEAN(store_and_fwd_flag),
                        BOOLEAN(dispatched),
                        BOOLEAN(colour) AS yellow,
                        tripdays,
                        BOOLEAN(refunded_flag),
                        BOOLEAN(mta_tax),
                        BOOLEAN(improvement_surcharge)
            FROM clean_one
        """)

DataFrame[]

In [None]:
# DataFrame handle on the typed clean_two view.
out_clean = spark.sql("""
                            SELECT *
                            FROM clean_two
                        """)

In [None]:
# Optional restart point: load the saved parquet instead of recomputing.
# datainitclean = spark.read.parquet("./data/out_init_clean.parquet")
# Alias the typed frame and register it as init_clean_view.
datainitclean = out_clean
datainitclean.createOrReplaceTempView("init_clean_view")

In [None]:
# Floats are inexact, so convert the distance and money columns to
# DECIMAL(10,3) before doing the sum and equality arithmetic below.
# All other columns pass through unchanged.

spark.sql("""
            CREATE OR REPLACE TEMP VIEW ic_two_view AS
            SELECT VendorID,
                    pickup_datetime,
                    dropoff_datetime,
                    passenger_count,
                    CAST(trip_distance AS DECIMAL(10,3)),
                    RatecodeID,
                    PULocationID,
                    DOLocationID,
                    payment_type,
                    CAST(fare_amount AS DECIMAL(10,3)),
                    CAST(extra AS DECIMAL(10,3)),
                    CAST(tip_amount AS DECIMAL(10,3)),
                    CAST(tolls_amount AS DECIMAL(10,3)),
                    CAST(total_amount AS DECIMAL(10,3)),
                    store_and_fwd_flag,
                    dispatched,
                    yellow,
                    tripdays,
                    refunded_flag,
                    mta_tax,
                    improvement_surcharge
            FROM init_clean_view
        """)

DataFrame[]

In [None]:
# Sanity check: row count per value of the `yellow` flag.
spark.sql("""
            SELECT yellow, count(yellow) AS count
            FROM ic_two_view
            GROUP BY yellow
            ORDER BY count DESC
            """).show()

+------+------+
|yellow| count|
+------+------+
| false|384615|
+------+------+



In [None]:
# Check how many rows have a total_amount equal to the sum of their parts.
# `conv` turns the boolean tax/surcharge flags back into their 0.5 / 0.3
# amounts; `calc` marks each row 1 when fare + extra + tax + tip + tolls +
# surcharge equals total_amount, else 0; the outer query counts both cases.
spark.sql("""
            WITH calc AS (
                WITH conv AS (
                    SELECT fare_amount, extra, tip_amount, tolls_amount, total_amount,
                        CASE WHEN mta_tax = true THEN 0.5 ELSE 0 END AS tax,
                        CASE WHEN improvement_surcharge = true THEN 0.3 ELSE 0 END AS sur
                    FROM ic_two_view
                )
                SELECT CASE WHEN (fare_amount +
                    extra +
                    tax +
                    tip_amount +
                    tolls_amount +
                    sur ) = total_amount THEN 1
                ELSE 0
                END AS totals_equal
                FROM conv
            )
            SELECT totals_equal, count(totals_equal) AS count
            FROM calc
            GROUP BY totals_equal
            ORDER BY count DESC
            """).show()

+------------+------+
|totals_equal| count|
+------------+------+
|           1|383613|
|           0|  1002|
+------------+------+



In [None]:
# Exploratory `summed` view: only the money columns plus the reconstructed
# tax/surcharge amounts, with `sum_costs` as the sum of all components.
# A later cell redefines `summed` with the full set of columns.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW summed AS
                WITH conv AS (
                    SELECT fare_amount, extra, tip_amount, tolls_amount, total_amount,
                        CASE WHEN mta_tax = true THEN 0.5 ELSE 0 END AS tax,
                        CASE WHEN improvement_surcharge = true THEN 0.3 ELSE 0 END AS sur
                    FROM ic_two_view
                )
                SELECT *, (fare_amount +
                    extra +
                    tax +
                    tip_amount +
                    tolls_amount +
                    sur ) AS sum_costs
                FROM conv
            """)

DataFrame[]

In [None]:
# Add `totals_diff`: the recorded total minus the sum of the components.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW diffs AS
            SELECT *, total_amount - sum_costs AS totals_diff
            FROM summed
            """)

DataFrame[]

In [None]:
# Check the schema (decimal precisions) of the rows with a non-zero difference.
spark.sql("""
            SELECT *
            FROM diffs
            WHERE totals_diff > "0"
            OR totals_diff < "0"
            """).printSchema()

root
 |-- fare_amount: decimal(10,3) (nullable = true)
 |-- extra: decimal(10,3) (nullable = true)
 |-- tip_amount: decimal(10,3) (nullable = true)
 |-- tolls_amount: decimal(10,3) (nullable = true)
 |-- total_amount: decimal(10,3) (nullable = true)
 |-- tax: decimal(11,1) (nullable = false)
 |-- sur: decimal(11,1) (nullable = false)
 |-- sum_costs: decimal(17,3) (nullable = true)
 |-- totals_diff: decimal(18,3) (nullable = true)



In [None]:
# Optional restart point: load the saved parquet instead of recomputing.
# datainitclean = spark.read.parquet("./data/out_init_clean.parquet")
# (Re)register the typed frame as init_clean_view before rebuilding the pipeline.
datainitclean.createOrReplaceTempView("init_clean_view")

In [None]:
# Rebuild ic_two_view (same definition as the earlier cell): convert the
# distance and money columns from float to DECIMAL(10,3) so the arithmetic
# and equality tests below are exact.

spark.sql("""
            CREATE OR REPLACE TEMP VIEW ic_two_view AS
            SELECT VendorID,
                    pickup_datetime,
                    dropoff_datetime,
                    passenger_count,
                    CAST(trip_distance AS DECIMAL(10,3)),
                    RatecodeID,
                    PULocationID,
                    DOLocationID,
                    payment_type,
                    CAST(fare_amount AS DECIMAL(10,3)),
                    CAST(extra AS DECIMAL(10,3)),
                    CAST(tip_amount AS DECIMAL(10,3)),
                    CAST(tolls_amount AS DECIMAL(10,3)),
                    CAST(total_amount AS DECIMAL(10,3)),
                    store_and_fwd_flag,
                    dispatched,
                    yellow,
                    tripdays,
                    refunded_flag,
                    mta_tax,
                    improvement_surcharge
            FROM init_clean_view
        """)

DataFrame[]

In [None]:
# Add the fee amounts back as numeric columns: 0.5 when the mta_tax flag is
# true and 0.3 when the improvement_surcharge flag is true, otherwise 0.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW conv AS
                    SELECT *,
                        CASE WHEN mta_tax = true THEN 0.5 ELSE 0 END AS mta_tax_fee,
                        CASE WHEN improvement_surcharge = true THEN 0.3 ELSE 0 END AS surchage_fee
                    FROM ic_two_view
            """)

DataFrame[]

In [None]:
# Add `sum_costs`: fare + extra + MTA tax + tip + tolls + surcharge.
# This replaces the exploratory `summed` view and keeps every column.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW summed AS
                SELECT *, (fare_amount +
                    extra +
                    mta_tax_fee +
                    tip_amount +
                    tolls_amount +
                    surchage_fee ) AS sum_costs
                FROM conv
            """)

DataFrame[]

In [None]:
# Add `totals_diff`: the recorded total minus the reconstructed sum.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW diffs AS
            SELECT *, total_amount - sum_costs AS totals_diff
            FROM summed
            """)

DataFrame[]

In [None]:
# Add `trip_time_sec`: trip duration in seconds, computed as the difference of
# the drop-off and pickup Unix timestamps.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW trip_sec_view AS
            SELECT *, (unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) AS trip_time_sec
            FROM diffs
            """)

DataFrame[]

In [None]:
# Check impact: count rows whose totals_diff is neither 0 nor 1.95, the two
# values accepted by the filter in the next cell.
spark.sql("""
            SELECT totals_diff
            FROM trip_sec_view
            WHERE NOT (totals_diff = 1.95 OR totals_diff = 0)
            """).count()

116

In [None]:
# Relatively low impact, so filter as planned. Rows are kept only when:
#   - totals_diff is 0 or 1.95
#   - extra is 0, 0.5 or 1
#   - trip_distance is in (0, 60]
#   - trip_time_sec is in (0, 120 minutes]
#   - fare_amount is in (0, 260]
#   - neither zone is 264 or 265
#   - RatecodeID is not 2 or 6
#   - tip_amount is in [0, 40]

spark.sql("""
            CREATE OR REPLACE TEMP VIEW clean_2_1_view AS
            SELECT *
            FROM trip_sec_view
            WHERE (totals_diff = 1.95 OR totals_diff = 0)
            AND (extra = 0 OR extra = 0.5 OR extra = 1)
            AND (trip_distance > 0 AND trip_distance <= 60)
            AND (trip_time_sec > 0 AND trip_time_sec <= (120*60))
            AND (fare_amount > 0 AND fare_amount <= 260)
            AND PULocationID != 264
            AND PULocationID != 265 
            AND DOLocationID != 264 
            AND DOLocationID != 265
            AND RatecodeID != 2
            AND RatecodeID != 6
            AND (tip_amount >= 0 AND tip_amount <= 40)
            """)

DataFrame[]

In [None]:
# DataFrame handle on the range-filtered clean_2_1_view.
cleancontinuous = spark.sql("""
            SELECT *
            FROM clean_2_1_view
            """)

In [None]:
# Confirm the column types after casting and the added fee/diff columns.
cleancontinuous.printSchema()

root
 |-- VendorID: byte (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: byte (nullable = true)
 |-- trip_distance: decimal(10,3) (nullable = true)
 |-- RatecodeID: byte (nullable = true)
 |-- PULocationID: short (nullable = true)
 |-- DOLocationID: short (nullable = true)
 |-- payment_type: byte (nullable = true)
 |-- fare_amount: decimal(10,3) (nullable = true)
 |-- extra: decimal(10,3) (nullable = true)
 |-- tip_amount: decimal(10,3) (nullable = true)
 |-- tolls_amount: decimal(10,3) (nullable = true)
 |-- total_amount: decimal(10,3) (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- dispatched: boolean (nullable = true)
 |-- yellow: boolean (nullable = true)
 |-- tripdays: integer (nullable = true)
 |-- refunded_flag: boolean (nullable = true)
 |-- mta_tax: boolean (nullable = true)
 |-- improvement_surcharge: boolean (nullable = true)
 |-- mta_tax_fee: decimal(11,1) 

In [None]:
# Rows remaining after the range filters.
cleancontinuous.count()

360628

In [None]:
# Register the filtered frame for SQL access as clean_cont_view.
cleancontinuous.createOrReplaceTempView("clean_cont_view")

In [None]:
# First feature-engineering pass.
# Flags:
#   flag_maxi_taxi      - 7 to 9 passengers
#   flag_lg_taxi        - 4 to 6 passengers
#   flag_no_passenger   - passenger_count is 0
#   flag_short_trip     - pickup and drop-off zone are the same
#   flag_near_two_error - recorded total differs from the component sum by 1.95
# Derived values: trip duration in minutes, trip_meta (seconds * distance / 2)
# and calendar parts of the pickup timestamp.
# PU_day_of_data counts days from the first day of the data. It is now measured
# from 2019-01-01 so that it agrees with the later PU_year_of_data column,
# which treats 2019 as year 0. The previous 2017-01-01 base made
# PU_day_of_data, and the PU_day_of_year derived from it, start at 730.

spark.sql("""
            CREATE OR REPLACE TEMP VIEW eng_1_view AS
            SELECT *,
                BOOLEAN(CASE WHEN passenger_count BETWEEN 7 AND 9 THEN true
                ELSE false END) AS flag_maxi_taxi,
                BOOLEAN(CASE WHEN passenger_count BETWEEN 4 AND 6 THEN true
                ELSE false END) AS flag_lg_taxi,
                BOOLEAN(CASE WHEN passenger_count = 0 THEN true
                ELSE false END) AS flag_no_passenger,
                BOOLEAN(CASE WHEN PUlocationID = DOlocationID THEN true
                ELSE false END) AS flag_short_trip,
                BOOLEAN(CASE WHEN totals_diff = 1.95 THEN true
                ELSE false END) AS flag_near_two_error,
                CAST((trip_time_sec / 60) AS DECIMAL(20, 6)) AS trip_time_min,
                ((trip_time_sec * trip_distance)/2) AS trip_meta,
                DATEDIFF(pickup_datetime, "2019-01-01") AS PU_day_of_data,
                YEAR(pickup_datetime) AS PU_year,
                MONTH(pickup_datetime) AS PU_month_of_year,
                WEEKOFYEAR(pickup_datetime) AS PU_week_of_year,
                DAYOFMONTH(pickup_datetime) AS PU_day_of_month,
                DAYOFWEEK(pickup_datetime) AS PU_day_of_week,
                HOUR(pickup_datetime) AS PU_hour_of_day,
                MINUTE(pickup_datetime) AS PU_min_of_hour
            FROM clean_cont_view
            """)

DataFrame[]

In [None]:
# Check how HOUR() encodes midnight: list the distinct pickup hours to see
# whether the range is 0-23 or 1-24.
spark.sql("""
            SELECT DISTINCT PU_hour_of_day
            FROM eng_1_view
            ORDER BY PU_hour_of_day
            """).show(26)

+--------------+
|PU_hour_of_day|
+--------------+
|             0|
|             1|
|             2|
|             3|
|             4|
|             5|
|             6|
|             7|
|             8|
|             9|
|            10|
|            11|
|            12|
|            13|
|            14|
|            15|
|            16|
|            17|
|            18|
|            19|
|            20|
|            21|
|            22|
|            23|
+--------------+



In [None]:
# Second pass: trip duration in hours, a year offset (0 for 2019, 1 for any
# other year) and the pickup time expressed as minutes since midnight.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW eng_2_view AS
            SELECT *,
              (trip_time_min / 60) AS trip_time_hour,
              CASE WHEN PU_year = 2019 THEN 0
              ELSE 1 END AS PU_year_of_data,
              (PU_min_of_hour + (PU_hour_of_day * 60)) AS PU_min_of_day
            FROM eng_1_view
            """)

DataFrame[]

In [None]:
# Third pass: average speed (distance per hour) and month / week / day
# counters that combine the calendar parts with the year offset.
# Upstream filtering keeps only trip_time_sec > 0, so the divisor is non-zero.
spark.sql("""
            CREATE OR REPLACE TEMP VIEW eng_3_view AS
            SELECT *,
                (trip_distance / trip_time_hour) AS trip_avg_speed,
                (PU_month_of_year + (PU_year_of_data * 12)) AS PU_month_of_data,
                (PU_week_of_year + (PU_year_of_data * 52)) AS PU_week_of_data,
                (PU_day_of_data - (PU_year_of_data * 365)) AS PU_day_of_year
            FROM eng_2_view
            """)

DataFrame[]

In [None]:
# DataFrame handle on the fully engineered eng_3_view.
firstVarEng = spark.sql("""
                SELECT *
                FROM eng_3_view
                """)

In [None]:
# Inspect the types of the engineered columns.
firstVarEng.printSchema()

root
 |-- VendorID: byte (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: byte (nullable = true)
 |-- trip_distance: decimal(10,3) (nullable = true)
 |-- RatecodeID: byte (nullable = true)
 |-- PULocationID: short (nullable = true)
 |-- DOLocationID: short (nullable = true)
 |-- payment_type: byte (nullable = true)
 |-- fare_amount: decimal(10,3) (nullable = true)
 |-- extra: decimal(10,3) (nullable = true)
 |-- tip_amount: decimal(10,3) (nullable = true)
 |-- tolls_amount: decimal(10,3) (nullable = true)
 |-- total_amount: decimal(10,3) (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- dispatched: boolean (nullable = true)
 |-- yellow: boolean (nullable = true)
 |-- tripdays: integer (nullable = true)
 |-- refunded_flag: boolean (nullable = true)
 |-- mta_tax: boolean (nullable = true)
 |-- improvement_surcharge: boolean (nullable = true)
 |-- mta_tax_fee: decimal(11,1) 

In [None]:
# Re-register the frame under eng_3_view, the same name its query read from.
firstVarEng.createOrReplaceTempView("eng_3_view")

In [None]:
# Repartition and sort the engineered data by pickup and drop-off zone
# (CLUSTER BY = DISTRIBUTE BY + SORT BY on the same columns) ahead of the
# zone-lookup joins.
firstVarEngCluster = spark.sql("""
                                SELECT *
                                FROM eng_3_view
                                CLUSTER BY PUlocationID, DOlocationID
                                """)

In [None]:
# Register the clustered frame for SQL access as eng_3_view_cluster.
firstVarEngCluster.createOrReplaceTempView("eng_3_view_cluster")

In [None]:
# Load the taxi zone lookup table (the header row supplies the column names).
taxizone = spark.read.csv("Dataset/taxi+_zone_lookup.csv", header = True)
# Minimise the partitions of this small dataset to reduce shuffle (similar to
# a broadcast). DataFrames are immutable: repartition() returns a new frame,
# so the result must be bound back to the name for it to take effect.
taxizone = taxizone.repartition(1)
# Display the frame, as the original cell did.
taxizone

DataFrame[LocationID: string, Borough: string, Zone: string, service_zone: string]

In [None]:
# Make the zone lookup table queryable from SQL under the name taxizone.
taxizone.createOrReplaceTempView("taxizone")

In [None]:
# sort to reduce shuffle
# NOTE(review): the DataFrame returned by this query is neither assigned to a
# variable nor registered as a view, so the exclusion of LocationID 264/265 and
# the SORT BY have no effect on the `taxizone` view that later cells select
# from — confirm whether this filtered table was meant to replace it.
# LocationID was read from CSV as a string column, so the comparison against
# the integer literals below relies on an implicit cast.
spark.sql("""
        SELECT *
        FROM taxizone
        WHERE LocationID != 264
        AND LocationID != 265
        SORT BY LocationID
        """)

DataFrame[LocationID: string, Borough: string, Zone: string, service_zone: string]

In [None]:
# Pickup-side copy of the zone lookup: every column gets a PU prefix so it can
# be joined onto the trips without clashing with the drop-off lookup columns.
pu_zone_view_sql = """
        CREATE OR REPLACE TEMP VIEW PUtaxizone AS
        SELECT LocationID AS PULocationID, Borough AS PUBorough, Zone AS PUZone, service_zone AS PUServiceZone
        FROM taxizone
        SORT BY PULocationID
        """
spark.sql(pu_zone_view_sql)

DataFrame[]

In [None]:
# Drop-off-side copy of the zone lookup: same columns as the source table,
# renamed with a DO prefix.
do_zone_view_sql = """
        CREATE OR REPLACE TEMP VIEW DOtaxizone AS
        SELECT LocationID AS DOLocationID, Borough AS DOBorough, Zone AS DOZone, service_zone AS DOServiceZone
        FROM taxizone
        SORT BY DOLocationID
        """
spark.sql(do_zone_view_sql)

DataFrame[]

In [None]:
# Attach pickup borough / zone / service zone to every trip.
# LEFT JOIN keeps trips whose PULocationID has no match in the lookup
# (their PU* columns come back NULL).
# NOTE(review): PULocationID is a short on the trip side and a string on the
# lookup side (CSV read without a schema), so the equality relies on an
# implicit cast — confirm this matches as intended.
pickup_join_sql = """
        CREATE OR REPLACE TEMP VIEW PUjoin AS
        SELECT eng_3_view_cluster.*, PUtaxizone.PUBorough, PUtaxizone.PUZone, PUtaxizone.PUServiceZone
        FROM eng_3_view_cluster
        LEFT JOIN PUtaxizone
        ON eng_3_view_cluster.PULocationID = PUtaxizone.PULocationID
        """
spark.sql(pickup_join_sql)

DataFrame[]

In [None]:
# Second enrichment step: add drop-off borough / zone / service zone on top of
# the pickup-enriched trips. LEFT JOIN again keeps unmatched trips, leaving
# their DO* columns NULL.
dropoff_join_sql = """
        CREATE OR REPLACE TEMP VIEW DOjoin AS
        SELECT PUjoin.*, DOtaxizone.DOBorough, DOtaxizone.DOZone, DOtaxizone.DOServiceZone
        FROM PUjoin
        LEFT JOIN DOtaxizone
        ON PUjoin.DOLocationID = DOtaxizone.DOLocationID
        """
spark.sql(dropoff_join_sql)

DataFrame[]

In [None]:
# DataFrame handle over the trips enriched with both pickup and drop-off zones.
zone_joined_sql = """
                        SELECT *
                        FROM DOjoin
                        """
zonejoined = spark.sql(zone_joined_sql)

In [None]:
# Print the schema of the zone-enriched trips to check the joined columns.
zonejoined.printSchema()

root
 |-- VendorID: byte (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: byte (nullable = true)
 |-- trip_distance: decimal(10,3) (nullable = true)
 |-- RatecodeID: byte (nullable = true)
 |-- PULocationID: short (nullable = true)
 |-- DOLocationID: short (nullable = true)
 |-- payment_type: byte (nullable = true)
 |-- fare_amount: decimal(10,3) (nullable = true)
 |-- extra: decimal(10,3) (nullable = true)
 |-- tip_amount: decimal(10,3) (nullable = true)
 |-- tolls_amount: decimal(10,3) (nullable = true)
 |-- total_amount: decimal(10,3) (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- dispatched: boolean (nullable = true)
 |-- yellow: boolean (nullable = true)
 |-- tripdays: integer (nullable = true)
 |-- refunded_flag: boolean (nullable = true)
 |-- mta_tax: boolean (nullable = true)
 |-- improvement_surcharge: boolean (nullable = true)
 |-- mta_tax_fee: decimal(11,1) 

In [None]:
# Remove the existing trip_avg_speed column; a later query selects
# `*, ... AS trip_avg_speed` from this view, which would otherwise yield two
# columns with the same name.
zonejoined = zonejoined.drop('trip_avg_speed')
# Publish the result to SQL as zonejoined_view.
zonejoined.createOrReplaceTempView("zonejoined_view")

In [None]:
# Recompute average speed as distance / hours, rounded to two decimals and
# stored as DECIMAL(10,2).
recast_speed_sql = """
        CREATE OR REPLACE TEMP VIEW checknew_view AS
        SELECT *, CAST(ROUND((trip_distance / trip_time_hour), 2) AS DECIMAL(10,2)) AS trip_avg_speed
        FROM zonejoined_view
        """
spark.sql(recast_speed_sql)

DataFrame[]

In [None]:
# Sanity-check the recomputed speed: count, mean, stddev, min, quartiles and max.
speed_only = spark.sql("""
        SELECT trip_avg_speed
        FROM checknew_view
        """)
speed_only.summary().show()

+-------+----------------+
|summary|  trip_avg_speed|
+-------+----------------+
|  count|          360628|
|   mean|       14.783688|
| stddev|125.787304080096|
|    min|            0.02|
|    25%|            9.06|
|    50%|           11.75|
|    75%|           15.75|
|    max|        32040.32|
+-------+----------------+



In [None]:
# Load the speed-corrected trips, then discard trip_meta.
# NOTE(review): presumably this clears a stale trip_meta so it can be
# recomputed from checkmeta_view without a duplicate column — confirm.
checknew_rows = spark.sql("""
                        SELECT *
                        FROM checknew_view
                        """)
checkmeta = checknew_rows.drop("trip_meta")

In [None]:
# Expose checkmeta to SQL as checkmeta_view.
checkmeta.createOrReplaceTempView("checkmeta_view")

In [None]:
# Derive trip_meta, a combined time/distance feature: (minutes * distance) / 2.
trip_meta_view_sql = """
        CREATE OR REPLACE TEMP VIEW tripmeta AS
        SELECT *,
        ((trip_time_min * trip_distance)/2) AS trip_meta
        FROM checkmeta_view
        """
spark.sql(trip_meta_view_sql)

In [None]:
# Pull a ~0.1% random sample of the columns needed for exploration into pandas.
# (The Python name `tripmeta` is a pandas DataFrame; the SQL temp view of the
# same name is unaffected.)
# A fixed seed makes the sample — and anything plotted from it — reproducible
# across notebook runs; without it every execution draws different rows.
tripmeta = spark.sql("""
                    SELECT trip_meta, fare_amount, pickup_datetime, PU_week_of_data, RatecodeID, PUBorough, yellow
                    FROM tripmeta
                    """).sample(fraction = 0.001, seed = 42).toPandas()

In [None]:
# Outlier filter: keep trips with fare_amount >= 2.5 and 0 < trip_avg_speed <= 60.
# CLUSTER BY then redistributes and sorts the surviving rows on the columns
# used for the partitioned parquet write, plus pickup_datetime.
feature_filter_sql = """
                    SELECT *
                    FROM tripmeta
                    WHERE fare_amount >= 2.5
                    AND trip_avg_speed > 0
                    AND trip_avg_speed <= 60
                    CLUSTER BY yellow, RatecodeID, PU_year_of_data, PU_month_of_data, pickup_datetime
                    """
featureout = spark.sql(feature_filter_sql)

In [None]:
# Persist the filtered feature table as parquet, with one directory level per
# partition column (yellow / RatecodeID / PU_year_of_data / PU_month_of_data).
# mode("overwrite") keeps the notebook re-runnable: the default save mode
# raises an error when ./data/featureout.parquet already exists.
# NOTE: run this cell while `featureout` is still the SQL-derived DataFrame,
# i.e. before it is rebound to a reader over this same path.
featureout.write.mode("overwrite").partitionBy("yellow","RatecodeID","PU_year_of_data", "PU_month_of_data").parquet("./data/featureout.parquet")

In [None]:
# Rebind featureout to the parquet dataset written to ./data/featureout.parquet,
# so later cells read the persisted files.
# NOTE(review): the partition columns are presumably recovered from the
# directory names, which would explain why RatecodeID and yellow no longer
# appear at their original positions in the printed schema — confirm.
featureout = spark.read.parquet("./data/featureout.parquet")

In [None]:
# Print the schema of the reloaded parquet dataset to verify columns and types.
featureout.printSchema()

root
 |-- VendorID: byte (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: byte (nullable = true)
 |-- trip_distance: decimal(10,3) (nullable = true)
 |-- PULocationID: short (nullable = true)
 |-- DOLocationID: short (nullable = true)
 |-- payment_type: byte (nullable = true)
 |-- fare_amount: decimal(10,3) (nullable = true)
 |-- extra: decimal(10,3) (nullable = true)
 |-- tip_amount: decimal(10,3) (nullable = true)
 |-- tolls_amount: decimal(10,3) (nullable = true)
 |-- total_amount: decimal(10,3) (nullable = true)
 |-- store_and_fwd_flag: boolean (nullable = true)
 |-- dispatched: boolean (nullable = true)
 |-- tripdays: integer (nullable = true)
 |-- refunded_flag: boolean (nullable = true)
 |-- mta_tax: boolean (nullable = true)
 |-- improvement_surcharge: boolean (nullable = true)
 |-- mta_tax_fee: decimal(11,1) (nullable = true)
 |-- surchage_fee: decimal(11,1) (nullable = true)
 |-- sum_c