In [1]:
import boto3
import os
import re
from tqdm import tqdm
import sys

In [2]:
# Show the kernel's current working directory.
os.getcwd()

'/home/jovyan/work/notebooks'

In [3]:
# Move from notebooks/ up one level so that relative paths such as 'data/...' resolve.
# Guarded on the directory name so that re-running this cell does not keep climbing
# the directory tree (an unconditional chdir('../') moves up again on every run).
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../')

In [4]:
# List the entries of the current working directory.
os.listdir()

['.DS_Store',
 '.env',
 '.git',
 '.gitattributes',
 '.ipynb_checkpoints',
 'Dockerfile',
 'README.md',
 'Untitled.ipynb',
 'data',
 'docker-compose.yml',
 'error_logs',
 'hs_err_pid133.log',
 'hs_err_pid277.log',
 'hs_err_pid681.log',
 'notebooks',
 'spark-warehouse',
 'src']

## Download Dataset

The first step is to download the dataset from S3. To do this, I use the boto3 library to access the public S3 data. I will be working with the 2017-18 data in this dataset.

In [5]:
# Module-level S3 client. Note that download_and_save_data() below creates its own
# client internally and does not use this one.
s3 = boto3.client('s3')

# Define function to download data and save it

Not sure if I'll need this with Spark - might still be worthwhile having data stored locally

In [6]:
def _download_cab_files(s3, bucket, keys, key_marker, year_regex, target_dir):
    """
    Download every key that contains `key_marker` and matches `year_regex` into `target_dir`.

    params:
    * s3 - client object exposing download_file(bucket, key, destination)
    * bucket - bucket the keys belong to
    * keys - iterable of object keys to choose from
    * key_marker - substring identifying the file family (e.g. 'yellow_tripdata')
    * year_regex - regular expression searched for in each key
    * target_dir - local folder to write into (created if missing)

    returns: list of the selected keys (downloaded now or already present)
    """
    # os.listdir / download_file both fail when the destination folder does not exist
    os.makedirs(target_dir, exist_ok=True)
    selected = [key for key in keys if key_marker in key and re.search(year_regex, key)]
    for key in tqdm(selected, desc=key_marker):
        file_name = re.sub(".*/", "", key)  # drop the "folder" part of the S3 key
        destination = os.path.join(target_dir, file_name)
        # Skip files that are already on disk so the function can be re-run cheaply.
        # NOTE: only existence is checked - a partially downloaded file counts as present.
        if not os.path.exists(destination):
            s3.download_file(bucket, key, destination)
    return selected


def download_and_save_data(bucket = 'nyc-tlc',prefix = 'trip data',year_regex = '201[78]', data_folder = 'data'):
    """
    Download the yellow and green cab trip files for the selected years from S3.

    Yellow cab files are written to <data_folder>/yellow_cabs and green cab files to
    <data_folder>/green_cabs. Files that already exist locally are skipped.

    params:
    * bucket - bucket of stored data (used for both listing and downloading)
    * prefix - folder (key prefix) with relevant stored data
    * year_regex - regular expression selecting the years for the project
    * data_folder - local root folder where the data is stored

    returns: the string "Data downloaded and saved"
    """
    s3 = boto3.client('s3')

    # A single list call returns at most 1000 keys, so walk every page of the listing.
    # A page without 'Contents' (nothing under the prefix) simply contributes no keys.
    keys = []
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket = bucket, Prefix = prefix):
        keys.extend(entry['Key'] for entry in page.get('Contents', []))

    print("Starting yellow cabs download")
    _download_cab_files(s3, bucket, keys, 'yellow_tripdata', year_regex,
                        os.path.join(data_folder, 'yellow_cabs'))
    print('Yellow Cabs completed')

    print("Starting green cabs download")
    _download_cab_files(s3, bucket, keys, 'green_tripdata', year_regex,
                        os.path.join(data_folder, 'green_cabs'))
    print("Green Cabs completed")
    return "Data downloaded and saved"
        
    

In [7]:
# Run the download; the arguments are the function's defaults, spelled out explicitly.
download_and_save_data(bucket = 'nyc-tlc',prefix = 'trip data',year_regex = '201[78]', data_folder = 'data')

Starting yellow cabs download
[            ------------------------]
Yellow Cabs completed
Starting green cabs download
[            ------------------------]
Green Cabs completed


'Data downloaded and saved'

## Create a Spark Session

In [8]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf


In [9]:
# Reuse an already-running session if there is one, otherwise start a new one.
session_builder = SparkSession.builder.appName('assignment_1')
spark = session_builder.getOrCreate()

#### Check versions

In [10]:
# Spark version of the active session.
spark.version

'2.4.5'

In [11]:
# Hadoop version, read through the session's JVM handle (_jvm is an underscore-prefixed,
# i.e. non-public, attribute).
spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()

'3.0.0'

### Load Data

In [12]:
# Explicit file lists handed to the Spark reader. Hidden entries (for example .DS_Store or
# .ipynb_checkpoints) are not trip data and are skipped; names are sorted so that the
# lists are the same on every run (os.listdir returns entries in arbitrary order).
g = ["data/green_cabs/" + name for name in sorted(os.listdir('data/green_cabs')) if not name.startswith('.')]
y = ["data/yellow_cabs/" + name for name in sorted(os.listdir('data/yellow_cabs')) if not name.startswith('.')]
# Every input file: green first, then yellow.
paths = g + y

In [None]:
# Read all green cab CSV files into one DataFrame, taking column names from the header row.
green = spark.read.option('header', 'true').csv(g)

In [None]:
# Read all yellow cab CSV files into one DataFrame, taking column names from the header row.
yellow = spark.read.option('header', 'true').csv(y)

## Feature Engineering

In [None]:
import pyspark.sql.functions as F

Create a variable for the colour of the taxi

In [None]:
from pyspark.sql.functions import lit

# Tag every row with the fleet it came from, so the source is still known once the
# two frames are combined.
green = green.withColumn('taxi_colour', F.lit('green'))
yellow = yellow.withColumn('taxi_colour', F.lit('yellow'))

Print the schemas - note that there is a difference in column names and types. We need to create columns with aligned names before the two datasets can be combined.

In [None]:
# Inspect the column names and types of the green frame.
green.printSchema()

In [None]:
# Inspect the column names and types of the yellow frame.
yellow.printSchema()

In [None]:
from pyspark.sql.types import DateType, IntegerType, BooleanType, TimestampType, FloatType
# The two fleets name their datetime columns differently (lpep_* for green, tpep_* for
# yellow). Expose both under the shared names pickup_date / dropoff_date, cast to timestamps.
green = (
    green
    .withColumn('pickup_date', F.col('lpep_pickup_datetime').cast(TimestampType()))
    .withColumn("dropoff_date", F.col("lpep_dropoff_datetime").cast(TimestampType()))
)
yellow = (
    yellow
    .withColumn('pickup_date', F.col('tpep_pickup_datetime').cast(TimestampType()))
    .withColumn("dropoff_date", F.col("tpep_dropoff_datetime").cast(TimestampType()))
)

In [None]:
# Check that pickup_date / dropoff_date are now present on the yellow frame.
yellow.printSchema()

In [None]:
# Check that pickup_date / dropoff_date are now present on the green frame.
green.printSchema()

In [None]:
# Column names of the green frame, for comparison with the yellow frame.
green.columns

In [None]:
# Column names of the yellow frame, for comparison with the green frame.
yellow.columns

In [None]:
# Columns kept from both fleets, in the order they will appear in the combined frame.
# The same list is selected from the green and the yellow DataFrame.
cols = [
    'VendorID', 'store_and_fwd_flag', 'RatecodeID',
    'PULocationID', 'DOLocationID',
    'passenger_count', 'trip_distance',
    'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
    'improvement_surcharge', 'total_amount', 'payment_type',
    'taxi_colour',
    'pickup_date', 'dropoff_date',
]

### Removed columns

Some columns have been removed from the dataset. Namely:
1. ehail_fee:  This is not in the data dictionary, but assume it is the fee for a cab to be 'hailed'. It does not exist in the yellow cabs dataset so therefore it should be excluded
2. Trip_type: only exists in the green taxi dataset - refers to whether a cab was dispatched or hailed
3. date/time stamps: these were transformed into new columns and had names aligned.
4. fare_amount: removed as per assignment brief

I have decided to leave 'ID' fields as strings for now, as they do not represent numbers but rather a categorical variable

In [None]:
# Reduce both frames to the shared column list, in the same order.
green = green.select(*cols)
yellow = yellow.select(*cols)

In [None]:
# Stack the two fleets into one frame. unionByName matches columns by name, whereas
# union() matches purely by position and would silently mix up columns if the two
# frames ever ended up with a different column order.
dfs = green.unionByName(yellow)

In [None]:
# Preview the first 5 rows of the combined frame.
dfs.show(5)

In [None]:
# Inspect the schema of the combined frame.
dfs.printSchema()

## Convert numeric to correct datatypes

In [None]:
from pyspark.sql.types import DateType, IntegerType, BooleanType, FloatType
# Give the non-ID columns proper types. The ID columns are deliberately left as strings.
# mta_tax, improvement_surcharge and total_amount are amounts like the other fare
# components, so they are cast to float as well instead of being left as strings.
float_columns = ['trip_distance', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
                 'improvement_surcharge', 'total_amount']
dfs = dfs.withColumn('store_and_fwd_flag', F.col('store_and_fwd_flag').astype(BooleanType())).\
    withColumn('passenger_count', F.col('passenger_count').astype(IntegerType()))
# withColumn on an existing name replaces the column in place, so column order is unchanged.
for column_name in float_columns:
    dfs = dfs.withColumn(column_name, F.col(column_name).astype(FloatType()))


In [None]:
# Confirm that the casts above are reflected in the schema.
dfs.printSchema()

## Create Columns

In [None]:
# Casting a timestamp to long yields epoch seconds, so the difference is the trip
# length in seconds.
dropoff_epoch = F.col("dropoff_date").cast("long")
pickup_epoch = F.col('pickup_date').cast("long")
dfs = dfs.withColumn('trip_duration', dropoff_epoch - pickup_epoch)

## Filter Data

There are a small number of records that appear to be erroneous (pickup dates outside 2017-18, or trip durations that are not positive). Because there are so few of them, I believe they should be excluded.

In [None]:
# Keep only trips picked up in 2017 or 2018 that have a positive duration.
valid_trip_condition = "YEAR(pickup_date) <2019 AND YEAR(pickup_date) >2016 AND trip_duration > 0"
dfs_cleaned = dfs.filter(valid_trip_condition)

## Save data file

In [None]:
# Persist the cleaned data as parquet, replacing any output left by a previous run.
dfs_cleaned.write.parquet('combined_cleaned_data.parquet', mode='overwrite')

In [None]:
# Read the file straight back to check that the write succeeded.
df_read = spark.read.format('parquet').load('combined_cleaned_data.parquet')

In [None]:
# Preview the rows that were read back from parquet.
df_read.show()

In [None]:
# Stop the Spark session now that the output has been written and checked.
spark.stop()