# New York Taxi Data Analysis 
Group-9

## Setup

In [None]:
# generic modules
import itertools
import os
import re
import timeit
import gc

# specific module
#import wget

# common ds modules
import pandas as pd
#import plotly.express as px

# spark modules for session management
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

# spark functions
from pyspark.sql.functions import lit
import pyspark.sql.functions as sparkle

# spark types
from pyspark.sql.types import *

# spark ml
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Create (or reuse) the Spark session for this notebook: application name
# "nyctaxi", local master using all cores, driver memory setting of 10G.
spark = (
    SparkSession.builder
    .appName('nyctaxi')
    .master('local[*]')
    .config('spark.driver.memory', '10G')
    .getOrCreate()
)


# Session options that are NOT applied: the next line is commented out and the
# triple-quoted text below is a bare string expression, so none of these
# .config(...) calls reach the SparkSession builder above.
#     .config("spark.sql.default.parallelism", "360") \ 
'''
.config("spark.driver.maxResultSize", "8g") \
    
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", "150000") \
    .config("spark.sql.tungsten.enabled", "true") \
    .config("spark.sql.shuffle.partitions", "360") \
    .config("spark.rdd.compress", "true") \
'''

'\n.config("spark.driver.maxResultSize", "8g")     \n    .config("spark.sql.execution.arrow.pyspark.enabled", "true")     .config("spark.sql.execution.arrow.maxRecordsPerBatch", "150000")     .config("spark.sql.tungsten.enabled", "true")     .config("spark.sql.shuffle.partitions", "360")     .config("spark.rdd.compress", "true") '

## Download Data from the website into the Docker container
https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [None]:
#spark.read.csv("Dataset/yellow-2019-01.csv")

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string, _c14: string, _c15: string, _c16: string, _c17: string]

## Converting CSV to Initial Parquet

In [None]:
# Convert every CSV file in ./Dataset/ into a Parquet dataset with the same
# base name under ./Dataset/prq/.
# sorted() gives a deterministic processing order (os.listdir order is arbitrary).
files = sorted(
    file[: -len(".csv")]
    for file in os.listdir("./Dataset/")
    if file.endswith(".csv")
)
for file in files:
    inpath = f"./Dataset/{file}.csv"
    # the first CSV line supplies the column names
    readdf = spark.read.csv(inpath, header="true")
    outpath = f"./Dataset/prq/{file}.parquet"
    # overwrite so the cell can be re-run; the default save mode raises an
    # error when the output path already exists
    readdf.write.mode("overwrite").parquet(outpath)

In [None]:
# taxi colours handled by the combining loop; only yellow is listed
colours = ["yellow"]

## Combining Dirty Data with color of taxi

In [None]:
# Combine the monthly Parquet datasets into a single Parquet dataset per colour.
for colour in colours:
    # Zero-row frame read from the January 2019 dataset: it contributes only
    # the schema that the monthly data is unioned onto.
    initpath = f"./Dataset/prq/{colour}-2019-01.parquet"
    outdf = spark.read.parquet(initpath).limit(0)
    # Parquet datasets of this colour, in a deterministic order
    # (os.listdir order is arbitrary).
    files = sorted(
        file[: -len(".parquet")]
        for file in os.listdir("./Dataset/prq")
        if file.endswith(".parquet") and file.startswith(colour)
    )
    for file in files:
        inpath = f"./Dataset/prq/{file}.parquet"
        readdf = spark.read.parquet(inpath)
        # unionByName matches columns by name; a plain union matches by
        # position and can map columns incorrectly
        outdf = outdf.unionByName(readdf)
    outpath = f"./data/{colour}-all.parquet"
    # overwrite so the cell can be re-run; the default save mode raises an
    # error when the output path already exists
    outdf.write.mode("overwrite").parquet(outpath)

In [None]:
# Load the combined yellow dataset written by the previous step, then list
# its column names as the cell output.
yellowdf = spark.read.parquet(
    "./data/yellow-all.parquet"
)
yellowdf.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge']

In [None]:
# hand the freshly loaded yellow dataframe to the next stage under the name `data`
data = yellowdf

## Uncleaned Full Dataset

In [None]:
# total number of records in the combined dataset (check that record numbers match)
data.count()

7667792

In [None]:
# Transform the dataframe in one chained expression:
#   - add the columns it is missing, each filled with a constant string value
#   - add a `colour` column to track which dataset a row came from
#   - rename the tpep_* datetime columns to pickup_datetime / dropoff_datetime
yellowdf = (
    yellowdf
    .withColumn('trip_type', lit("1"))
    .withColumn('ehail_fee', lit("0"))
    .withColumn('colour', lit("yellow"))
    .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
    .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime")
)

In [None]:
# hand the transformed yellow dataframe (extra columns, renamed datetimes) to the next stage
data = yellowdf

In [None]:
# initial schema: no types imputed yet because no cleaning has been done
data.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- trip_type: string (nullable = false)
 |-- ehail_fee: string (nullable = false)
 |-- colour: string (nullable = false)



In [None]:
# register the dataframe as the temp view "data_init_view" so the spark.sql
# queries below can refer to it by name
data.createOrReplaceTempView("data_init_view")

## Grouping by Colour

In [None]:
# SQL query: group by the relevant variable (colour) and count the rows in
# each group to check the split
spark.sql("""
            SELECT colour, count(colour)
            FROM data_init_view
            GROUP by colour
        """).show()

+------+-------------+
|colour|count(colour)|
+------+-------------+
|yellow|      7667792|
+------+-------------+



## VendorID
Should be 1 or 2
   - 1-Creative Mobile Technologies
   - 2-Verifone INC.
    
- VendorId=4 contains 230,613 records?
- Ratecodes include 99 for VendorId=4, which is invalid; it should be in the range 1-6

In [None]:
# number of records for each VendorID value
spark.sql("""
            SELECT VendorID, count(VendorID)
            FROM data_init_view
            GROUP by VendorID
        """).show()

+--------+---------------+
|VendorID|count(VendorID)|
+--------+---------------+
|       1|        2938778|
|       4|          76823|
|       2|        4652191|
+--------+---------------+



In [None]:
# display a sample of the rows recorded with VendorID 4
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE VendorID == 4
        """).show()

+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|       4|2019-01-25 17:00:59|2019-01-25 17:04:53|              1|          .58|         1|                 N|         237|         262|           2|  

In [None]:
# number of VendorID 4 records per colour
spark.sql("""
            SELECT colour, count(colour)
            FROM data_init_view
            WHERE VendorID == 4
            GROUP by colour
        """).show()

+------+-------------+
|colour|count(colour)|
+------+-------------+
|yellow|        76823|
+------+-------------+



In [None]:
# RatecodeID distribution within the VendorID 4 records
spark.sql("""
            SELECT RatecodeID, count(RatecodeID)
            FROM data_init_view
            WHERE VendorID == 4
            GROUP by RatecodeID
        """).show()

+----------+-----------------+
|RatecodeID|count(RatecodeID)|
+----------+-----------------+
|         3|               78|
|        99|                5|
|         5|              225|
|         1|            75240|
|         4|               46|
|         2|             1229|
+----------+-----------------+



In [None]:
# passenger_count distribution within the VendorID 4 records
spark.sql("""
            SELECT passenger_count, count(passenger_count)
            FROM data_init_view
            WHERE VendorID == 4
            GROUP by passenger_count
        """).show()

+---------------+----------------------+
|passenger_count|count(passenger_count)|
+---------------+----------------------+
|              3|                   266|
|              5|                     9|
|              1|                 75449|
|              4|                   107|
|              2|                   992|
+---------------+----------------------+



In [None]:
# Build the temp view "vid4months": one row per VendorID 4 record, labelled
# 'jan' when the pickup timestamp contains "2019-01" and "unknown" otherwise.
spark.sql("""
            CREATE OR REPLACE TEMPORARY VIEW vid4months
            AS
            SELECT pickup_datetime,
                CASE
                    WHEN pickup_datetime LIKE '%2019-01%' THEN 'jan'
                    ELSE "unknown"
                END AS MonthGroup
            FROM data_init_view
            WHERE VendorID == 4
        """)

# number of VendorID 4 records per month label, smallest group first
spark.sql("""
            SELECT MonthGroup, count(MonthGroup) as count
            FROM vid4months
            GROUP BY MonthGroup
            ORDER BY count
        """).show()


+----------+-----+
|MonthGroup|count|
+----------+-----+
|       jan|76823|
+----------+-----+



## Passenger Count
- 657,274 records of passenger count 0
- For passenger counts of 7-9
    - ~350 records (what are they, maxi-type taxis?)

In [None]:
# number of records for each passenger_count value
spark.sql("""
            SELECT passenger_count, count(passenger_count)
            FROM data_init_view
            GROUP by passenger_count
        """).show()

+---------------+----------------------+
|passenger_count|count(passenger_count)|
+---------------+----------------------+
|              7|                    19|
|              3|                314721|
|              8|                    29|
|              0|                117381|
|              5|                323842|
|              6|                200811|
|              9|                     9|
|              1|               5456121|
|              4|                140753|
|              2|               1114106|
+---------------+----------------------+



## RatecodeID should be in range(1-6)
- 1271 with RatecodeID=99
- 693 with distance 0
- 394 PULocation 264

In [None]:
# number of records for each RatecodeID value
spark.sql("""
            SELECT RatecodeID, count(RatecodeID)
            FROM data_init_view
            GROUP by RatecodeID
        """).show()

+----------+-----------------+
|RatecodeID|count(RatecodeID)|
+----------+-----------------+
|         3|            11801|
|        99|              252|
|         5|            54569|
|         6|               46|
|         1|          7430139|
|         4|             4895|
|         2|           166090|
+----------+-----------------+



In [None]:
# records whose RatecodeID falls outside 1-6, counted per colour
spark.sql("""
            SELECT colour, count(colour) as count
            FROM data_init_view
            WHERE NOT RatecodeID BETWEEN 1 AND 6
            GROUP BY colour
        """).show()

+------+-----+
|colour|count|
+------+-----+
|yellow|  252|
+------+-----+



In [None]:
# trip_distance values among the records whose RatecodeID falls outside 1-6,
# most frequent first
spark.sql("""
            SELECT trip_distance, count(trip_distance) AS count
            FROM data_init_view
            WHERE NOT RatecodeID BETWEEN 1 AND 6
            GROUP by trip_distance
            ORDER BY count DESC
        """).show()

+-------------+-----+
|trip_distance|count|
+-------------+-----+
|          .00|  145|
|          .84|    4|
|         1.27|    4|
|         1.35|    3|
|          .66|    2|
|          .72|    2|
|          .74|    2|
|         1.50|    2|
|          .94|    2|
|          .73|    2|
|         3.84|    2|
|         2.57|    2|
|         2.19|    2|
|         1.18|    2|
|         1.52|    2|
|         2.78|    1|
|         1.30|    1|
|        23.89|    1|
|         1.68|    1|
|         1.74|    1|
+-------------+-----+
only showing top 20 rows



In [None]:
# pickup locations among the records whose RatecodeID falls outside 1-6,
# most frequent first
spark.sql("""
            SELECT PULocationID, count(PULocationID) AS count
            FROM data_init_view
            WHERE NOT RatecodeID BETWEEN 1 AND 6
            GROUP by PULocationID
            ORDER BY count DESC
        """).show()

+------------+-----+
|PULocationID|count|
+------------+-----+
|         264|   84|
|         265|   16|
|         142|    8|
|         239|    8|
|         170|    7|
|          43|    7|
|          79|    7|
|         231|    6|
|         162|    6|
|         230|    6|
|         138|    5|
|         193|    5|
|         132|    5|
|         107|    5|
|         161|    5|
|         234|    4|
|         141|    4|
|         237|    4|
|         145|    4|
|         238|    3|
+------------+-----+
only showing top 20 rows



## Payment Type should be in range(1-6)
- All valid within range
- No 6 
- Count descends as the payment type increases

In [None]:
# number of records for each payment_type value, most frequent first
spark.sql("""
            SELECT payment_type, count(payment_type) AS count
            FROM data_init_view
            GROUP by payment_type
            ORDER BY count DESC
        """).show()

+------------+-------+
|payment_type|  count|
+------------+-------+
|           1|5486027|
|           2|2137415|
|           3|  33186|
|           4|  11164|
+------------+-------+



## Extra should be 0.5 or 1
- If all the values are valid then we can change it to two bools[]
- Out-of-range values, including negatives; overnight charges?
- 101 unique values 
- 19 negative values
- Valid values
    - 4598696 records contain 1
    - 7902055 records contain 0.5

In [None]:
# number of records for each extra value, most frequent first
spark.sql("""
            SELECT extra, count(extra) AS count
            FROM data_init_view
            GROUP by extra
            ORDER BY count DESC
        """).show()

+-----+-------+
|extra|  count|
+-----+-------+
|    0|4199855|
|  0.5|2116494|
|    1|1316580|
|  4.5|  31241|
| -0.5|   2201|
|   -1|    863|
|  0.8|    229|
| -4.5|     79|
|  1.3|     74|
| 17.5|     63|
|  1.8|     34|
|  2.5|     21|
|  0.3|     10|
|   18|      9|
|    3|      7|
| 18.5|      6|
|  5.3|      4|
|  0.2|      3|
| 0.25|      1|
| 10.9|      1|
+-----+-------+
only showing top 20 rows



In [None]:
# number of groups in the per-value breakdown of extra,
# i.e. how many different extra values occur
spark.sql("""
            SELECT extra, count(extra) AS count
            FROM data_init_view
            GROUP by extra
            ORDER BY count DESC
        """).count()

37

In [None]:
# Number of different negative extra values.
# extra is a string column. Comparing it directly with the integer literal 0
# makes Spark cast the string to an integer, which drops the fractional part
# ("-0.5" becomes 0), so fractional negatives would not count as negative.
# The explicit cast to DOUBLE keeps the fraction.
spark.sql("""
            SELECT extra, count(extra) AS count
            FROM data_init_view
            WHERE CAST(extra AS DOUBLE) < 0
            GROUP by extra
            ORDER BY count DESC
        """).count()

7

## Mta_Tax should be 0.5
- If all values are valid change to bool[]
- 37262299 valid values
- 203257: 0 values
- 52388: -0.5 value(Refund?)
    - Check other out of range and negative values

In [None]:
# number of records for each mta_tax value, most frequent first
spark.sql("""
            SELECT mta_tax, count(mta_tax) AS count
            FROM data_init_view
            GROUP by mta_tax
            ORDER BY count DESC
        """).show()

+-------+-------+
|mta_tax|  count|
+-------+-------+
|    0.5|7625883|
|      0|  34984|
|   -0.5|   6819|
|   0.25|     97|
|   0.35|      2|
|  32.53|      1|
|  37.51|      1|
|    0.9|      1|
|   2.42|      1|
|   60.8|      1|
|      1|      1|
|   18.3|      1|
+-------+-------+



### Out of range value
- 52556: Out of range value

In [None]:
# records whose mta_tax is neither "0" nor "0.5", counted per colour
spark.sql("""
            SELECT colour, count(colour) AS count
            FROM data_init_view
            WHERE mta_tax != "0"
            AND mta_tax != "0.5"
            GROUP by colour
            ORDER BY count DESC
        """).show()

+------+-----+
|colour|count|
+------+-----+
|yellow| 6925|
+------+-----+



## Improvement_Surcharge should be 0.3
- If all values are valid change to bool[]
- 37447079 valid 0.3 values
- 53940: -0.3 value(Refund?)
- 17076: 0 value
- 16: 1 value
    - All have trip_distance 0
    - All have PU/DO location ID = 265

In [None]:
# number of records for each improvement_surcharge value, most frequent first
spark.sql("""
            SELECT improvement_surcharge, count(improvement_surcharge) AS count
            FROM data_init_view
            GROUP by improvement_surcharge
            ORDER BY count DESC
        """).show()

+---------------------+-------+
|improvement_surcharge|  count|
+---------------------+-------+
|                  0.3|7658005|
|                 -0.3|   7129|
|                    0|   2657|
|                  0.6|      1|
+---------------------+-------+



In [None]:
# display the rows whose improvement_surcharge equals "1"
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE improvement_surcharge = "1"
        """).show()

+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|pickup_datetime|dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----

## Trip Type should be 1 for Yellow Taxi
- All are valid

In [None]:
# number of records for each trip_type value, most frequent first
spark.sql("""
            SELECT trip_type, count(trip_type) AS count
            FROM data_init_view
            GROUP by trip_type
            ORDER BY count DESC
        """).show()

+---------+-------+
|trip_type|  count|
+---------+-------+
|        1|7667792|
+---------+-------+



## Check Location Value
(Should be an integer from 1-265)- from Taxizone lookup Table:-
    https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
- PULocationID
- DOLocationID
- Both
    - No non integer values 
    - No null

In [None]:
# look for non-integer pickup locations: rows whose PULocationID is within
# 1-265 but leaves a remainder when divided by 1
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE PULocationID BETWEEN 1 AND 265
            and mod(PULocationID, 1) != 0
        """).show()

+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|pickup_datetime|dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----

In [None]:
# number of records for each pickup location, most frequent first
spark.sql("""
            SELECT PULocationID, count(PULocationID) AS count
            FROM data_init_view
            GROUP by PULocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|PULocationID| count|
+------------+------+
|         237|332473|
|         236|323008|
|         161|312392|
|         162|277166|
|         230|263646|
|         186|260712|
|          48|240903|
|         170|238978|
|         234|237648|
|         142|235144|
|         239|207883|
|         163|199682|
|         132|196612|
|          79|193955|
|         141|192380|
|         138|184334|
|         107|176786|
|         164|172647|
|          68|171971|
|         238|162192|
+------------+------+
only showing top 20 rows



In [None]:
# display the rows whose DOLocationID falls outside 1-265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE NOT DOLocationID BETWEEN 1 AND 265
        """).show()

+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|pickup_datetime|dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----

In [None]:
# number of records for each dropoff location, most frequent first
spark.sql("""
            SELECT DOLocationID, count(DOLocationID) AS count
            FROM data_init_view
            GROUP by DOLocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         236|334323|
|         237|296185|
|         161|293782|
|         170|242037|
|         162|232451|
|         230|225336|
|         142|214164|
|          48|208624|
|         234|204386|
|         239|204350|
|         141|202184|
|         186|189486|
|         163|175754|
|         238|175310|
|          79|168608|
|          68|167144|
|         107|162697|
|         263|158297|
|         164|154200|
|         140|152042|
+------------+------+
only showing top 20 rows



In [None]:
# number of records per dropoff location for DOLocationID 264 and above
spark.sql("""
            SELECT DOLocationID, count(DOLocationID) AS count
            FROM data_init_view
            WHERE DOLocationID >= 264
            GROUP by DOLocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         264|149094|
|         265| 16817|
+------------+------+



## Check dates are within range:-
(Should be 2019-01-01 00:00:00 to 2019-05-31 23:59:59)
- Pickup_datetime
    - min 2001-01-01 00:09:39
        -
    - Max 2088-01-24 00:25:39
        -
- Dropoff_datetime
    - min 2001-01-01 06:39:54
        -
    - Max 2088-01-24 07:28:25
        -

In [None]:
# smallest and largest pickup and dropoff timestamps in the dataset
spark.sql("""
            SELECT MIN(pickup_datetime), MAX(pickup_datetime), MIN(dropoff_datetime),  MAX(dropoff_datetime)
            FROM data_init_view
        """).show()

+--------------------+--------------------+---------------------+---------------------+
|min(pickup_datetime)|max(pickup_datetime)|min(dropoff_datetime)|max(dropoff_datetime)|
+--------------------+--------------------+---------------------+---------------------+
| 2001-02-02 14:55:07| 2088-01-24 00:25:39|  2001-02-02 15:07:27|  2088-01-24 07:28:25|
+--------------------+--------------------+---------------------+---------------------+



In [None]:
# calculate tripdays (dropoff date minus pickup date, via datediff) and count
# the trips for each tripdays value, most frequent first
spark.sql("""
            WITH tripdaysTable AS (
            SELECT *, datediff(dropoff_datetime, pickup_datetime) as tripdays
            FROM data_init_view
            )
            SELECT tripdays, count(tripdays) AS count
            FROM tripdaysTable
            GROUP by tripdays
            ORDER BY count DESC
        """).show()

+--------+-------+
|tripdays|  count|
+--------+-------+
|       0|7597276|
|       1|  70508|
|     -58|      1|
|      -2|      1|
|     -19|      1|
|      30|      1|
|      22|      1|
|      24|      1|
|       5|      1|
|       2|      1|
+--------+-------+



In [None]:
# total number of trips whose tripdays value is neither 0 nor 1, obtained by
# summing the per-tripdays counts from the same breakdown as the previous query
spark.sql("""
            WITH countsTable AS (
                WITH tripdaysTable AS (
                    SELECT *, datediff(dropoff_datetime, pickup_datetime) as tripdays
                    FROM data_init_view
                    )
                SELECT tripdays, count(tripdays) AS count
                FROM tripdaysTable
                GROUP by tripdays
                ORDER BY count DESC
            )
            SELECT sum(count)
            FROM countsTable
            WHERE tripdays != "0"
            AND tripdays != "1"
        """).show()

+----------+
|sum(count)|
+----------+
|         8|
+----------+



In [None]:
# inspect out-of-range values: rows picked up before 2019-01-01 00:00:00,
# latest pickup first
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE pickup_datetime < "2019-01-01 00:00:00"
            ORDER BY pickup_datetime DESC
        """).show()

+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|    pickup_datetime|   dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+-------------------+-------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|       2|2018-12-31 23:59:58|2019-01-01 00:03:52|              1|          .66|         1|                 N|         162|         170|           2|  

In [None]:
# count high out-of-range values: rows picked up after 2019-12-31 23:59:59
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE pickup_datetime > "2019-12-31 23:59:59" 
        """).count()

2

In [None]:
# count low out-of-range values: rows picked up before 2019-01-01 00:00:00
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE pickup_datetime < "2019-01-01 00:00:00"
        """).count()

441

In [None]:
# count rows dropped off before 2019-01-01 00:00:00
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE dropoff_datetime < "2019-01-01 00:00:00"
        """).count()

316

In [None]:
# count rows dropped off after 2019-01-31 23:59:59
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE dropoff_datetime > "2019-01-31 23:59:59" 
            ORDER BY dropoff_datetime
        """).count()

3011

In [None]:
# investigate trips that straddle the end of January: picked up on or before
# 2019-01-31 23:59:59 but dropped off after it.
# The pickup bound is inclusive (<=) because the other checks treat
# 23:59:59 itself as still inside the range.
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE dropoff_datetime > "2019-01-31 23:59:59"
            AND pickup_datetime <= "2019-01-31 23:59:59"
            ORDER BY dropoff_datetime
        """).count()

2915

In [None]:
# show candidate values to evaluate: rows picked up before 2019-12-31 23:59:59
# and dropped off after 2020-01-01 23:59:59, earliest dropoff first
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE dropoff_datetime > "2020-01-01 23:59:59"
            AND pickup_datetime < "2019-12-31 23:59:59" 
            ORDER BY dropoff_datetime
        """).show()

+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
|VendorID|pickup_datetime|dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|trip_type|ehail_fee|colour|
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+---------+---------+------+
+--------+---------------+----------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----

In [None]:
# count where pickup location was unknown: records per PULocationID for the
# IDs 264 and 265
spark.sql("""
            SELECT PULocationID, count(PULocationID) AS count
            FROM data_init_view
            WHERE PULocationID = "264"
            OR PULocationID = "265"
            GROUP by PULocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|PULocationID| count|
+------------+------+
|         264|159760|
|         265|  3871|
+------------+------+



In [None]:
# and for dropoff: records per DOLocationID for the IDs 264 and 265
spark.sql("""
            SELECT DOLocationID, count(DOLocationID) AS count
            FROM data_init_view
            WHERE DOLocationID = "264"
            OR DOLocationID = "265"
            GROUP by DOLocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         264|149094|
|         265| 16817|
+------------+------+



In [None]:
# count where the pickup location was 264 (unknown) and so was the dropoff
spark.sql("""
            SELECT DOLocationID, count(DOLocationID) AS count
            FROM data_init_view
            WHERE PULocationID = "264"
            AND DOLocationID = "264"
            GROUP by DOLocationID
            ORDER BY count DESC
        """).show()

+------------+------+
|DOLocationID| count|
+------------+------+
|         264|138614|
+------------+------+



In [None]:
# count where pickup was the other unknown ID (265) but dropoff was neither
# 264 nor 265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE PULocationID = "265"
            AND DOLocationID != "264"
            AND DOLocationID != "265"
        """).count()

811

In [None]:
# count where dropoff was 265 but pickup was neither 264 nor 265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE DOLocationID = "265"
            AND PULocationID != "264"
            AND PULocationID != "265"
        """).count()

13316

In [None]:
# count where dropoff was 264 but pickup was neither 264 nor 265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE DOLocationID = "264"
            AND PULocationID != "264"
            AND PULocationID != "265"
        """).count()

10454

In [None]:
# count where both dropoff and pickup were 265
spark.sql("""
            SELECT *
            FROM data_init_view
            WHERE DOLocationID = "265"
            AND PULocationID = "265"
        """).count()

3034

In [None]:
# print the schema of `data` again at the end of the initial inspection
data.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- trip_type: string (nullable = false)
 |-- ehail_fee: string (nullable = false)
 |-- colour: string (nullable = false)



In [None]:
# remove the temp view used by the inspection queries above
spark.catalog.dropTempView("data_init_view")