In this notebook, I clean up the raw data from April and March.  I filter the data even more, enrich the taxi trip data with weather data and the sunrise/sunset information, and save off training, validation, and testing data.

In [1]:
import pandas as pd
from datetime import datetime

pd.set_option('display.max_columns', None)

In [2]:
def getFloatOfPrecipitation(x):
    """Convert one weather-report amount to a float.

    The marker "T" (presumably "trace" -- confirm against the weather file's
    legend) is mapped to the tiny positive value 0.00001.  Every other value
    is handed to float() unchanged, so anything float() rejects raises.
    """
    return 0.00001 if x == "T" else float(x)

def getDate(d, format):
    """Parse the string d with datetime.strptime using the given format.

    Returns None when d is not a str (for example a missing value that was
    read as a float); otherwise returns the parsed datetime.
    """
    return datetime.strptime(d, format) if isinstance(d, str) else None

def calcSecondsBetween(pickup_time, dropoff_time):
    """Return the time from pickup_time to dropoff_time in seconds, as a float.

    Both arguments must support subtraction yielding an object with a
    total_seconds() method (e.g. datetime or pandas Timestamp).  The result
    is negative when dropoff_time is earlier than pickup_time.
    """
    # The subtraction is done once; the original computed it twice and
    # discarded the first result in an unused local.
    return (dropoff_time - pickup_time).total_seconds()

In [3]:
# Sunrise/sunset table, read with every column kept as a string (dtype=object),
# plus the April daily weather table.
sun_data = pd.read_csv("data/2016SunriseSunset.txt", dtype=object)
weather_april = pd.read_csv("data/April2016Weather.txt")

# Turn the amount columns into floats; the "T" marker becomes 0.00001.
for amount_column in ("Precipitation", "NewSnow"):
    weather_april[amount_column] = weather_april[amount_column].map(getFloatOfPrecipitation)

# Parse the date strings, then derive the month and day-of-month join keys.
weather_april["Date"] = weather_april["Date"].map(lambda raw: getDate(raw, "%Y-%m-%d"))
weather_april["Month"] = weather_april["Date"].map(lambda parsed: parsed.month)
weather_april["DayNum"] = weather_april["Date"].map(lambda parsed: parsed.day)

weather_april = weather_april.drop(
    columns=["HDD", "CDD", "Departure", "SnowDepth", "MaxTemp", "MinTemp", "Date"]
)

# Lookup keyed by row positions 3 and 4, holding row positions 0, 1 and 2.
# NOTE(review): this positional indexing assumes the remaining columns are
# exactly AvgTemp, Precipitation, NewSnow, Month, DayNum in that order --
# confirm against the CSV header.
weather_april_dict = {
    (row[3], row[4]): (row[0], row[1], row[2])
    for row in weather_april.values
}

# Day column plus the "4RIS"/"4SET" columns (presumably April sunrise/sunset);
# rows with a missing value in any of the three are dropped.
april_sun_data = sun_data[["DA", "4RIS", "4SET"]].dropna()
april_sun_data["DA"] = april_sun_data["DA"].map(lambda raw: getDate(raw, "%d").day)
for clock_column in ("4RIS", "4SET"):
    # "%H%M" parsing yields datetimes on the default date 1900-01-01.
    april_sun_data[clock_column] = april_sun_data[clock_column].map(lambda raw: getDate(raw, "%H%M"))

# day-of-month -> (4RIS value, 4SET value)
sun_april_dict = {row[0]: (row[1], row[2]) for row in april_sun_data.values}

There are a little over 11,000 taxi trip records with a total trip time of more than 3 hours.  While it is conceivable that someone might buy out a taxi for a day and drive to multiple places, this is a difficult situation to predict.  On the other hand, 3 hours of traffic seems possible, so I will keep trips lasting up to 3 hours.

In [4]:
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by this project.
green_taxi_april = pd.read_pickle("data/allAprilData")

# Look up the weather tuple for each trip by the (month, day) of its pickup.
# A pickup date missing from weather_april_dict raises KeyError.
taxi_april_weather_join_series = green_taxi_april["lpep_pickup_datetime"].map(lambda x: weather_april_dict[(x.month, x.day)])
green_taxi_april["AvgTemp"] = taxi_april_weather_join_series.map(lambda x: x[0])
green_taxi_april["Precipitation"] = taxi_april_weather_join_series.map(lambda x: x[1])
green_taxi_april["NewSnow"] = taxi_april_weather_join_series.map(lambda x: x[2])

# Trip duration in seconds and in whole (rounded) minutes.
green_taxi_april["trip_length_seconds"] = green_taxi_april[["lpep_pickup_datetime" ,"Lpep_dropoff_datetime"]].apply(lambda x: calcSecondsBetween(*x), axis=1)
green_taxi_april["trip_length_minutes_rounded"] = round(green_taxi_april["trip_length_seconds"]/60)

# Keep trips longer than 30 seconds and at most 180 rounded minutes.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not trigger SettingWithCopyWarning.
green_taxi_april = green_taxi_april[(green_taxi_april["trip_length_seconds"] > 30) & (green_taxi_april["trip_length_minutes_rounded"] <= 180)].copy()

# Calendar features of the pickup timestamp.
green_taxi_april["pickup_minute"] = green_taxi_april["lpep_pickup_datetime"].map(lambda x: x.minute)
green_taxi_april["pickup_hour"] = green_taxi_april["lpep_pickup_datetime"].map(lambda x: x.hour)
green_taxi_april["pickup_day"] = green_taxi_april["lpep_pickup_datetime"].map(lambda x: x.day)
green_taxi_april["pickup_weekday"] = green_taxi_april["lpep_pickup_datetime"].map(lambda x: x.weekday())

# Calendar features of the drop-off timestamp.
green_taxi_april["dropoff_minute"] = green_taxi_april["Lpep_dropoff_datetime"].map(lambda x: x.minute)
green_taxi_april["dropoff_hour"] = green_taxi_april["Lpep_dropoff_datetime"].map(lambda x: x.hour)
green_taxi_april["dropoff_day"] = green_taxi_april["Lpep_dropoff_datetime"].map(lambda x: x.day)
green_taxi_april["dropoff_weekday"] = green_taxi_april["Lpep_dropoff_datetime"].map(lambda x: x.weekday())

In [5]:
def mapStoreFwdFlag(x):
    """Encode a store-and-forward flag as an integer.

    Returns 0 when x equals "N" and 1 for every other value.  Values that
    are already encoded (0 or 1) are not "N", so they also map to 1.
    """
    return 0 if x == "N" else 1
    
# Replace the flag column with its 0/1 encoding ("N" -> 0, anything else -> 1).
# This overwrites the column in place, so running the cell a second time maps
# the already-encoded values (none of which equal "N") to 1.
green_taxi_april["Store_and_fwd_flag"] = green_taxi_april["Store_and_fwd_flag"].map(mapStoreFwdFlag)

In [6]:
def relevantTimeToAprilSunsetSunrise(x):
    """Return (minutes_after_sunrise, minutes_before_sunset) for timestamp x.

    The pair is looked up in sun_april_dict by x.day, and x is reduced to its
    hour and minute (seconds are ignored) on the date 1900-01-01 so that it
    can be compared with the parsed sunrise/sunset times.

    * Before noon: if x is at or after sunrise and fewer than 90 minutes past
      it, the result is (minutes_since_sunrise, 0).
    * Noon or later: if x is at or before sunset and fewer than 90 minutes
      ahead of it, the result is (0, minutes_until_sunset).
    * In every other case the result is (0, 0).
    """
    sunrise, sunset = sun_april_dict[x.day]
    clock_time = datetime(1900, 1, 1, x.hour, x.minute)
    is_morning = x.hour < 12

    # Signed distance to the relevant event: negative means "before sunrise"
    # in the morning or "after sunset" in the afternoon/evening.
    offset = (clock_time - sunrise) if is_morning else (sunset - clock_time)
    offset_minutes = offset.total_seconds() / 60.0

    if offset_minutes < 0 or offset_minutes >= 90:
        return (0, 0)
    return (offset_minutes, 0) if is_morning else (0, offset_minutes)
    
# (minutes_after_sunrise, minutes_before_sunset) pairs for both ends of each trip.
pickup_sun_series = green_taxi_april["lpep_pickup_datetime"].map(relevantTimeToAprilSunsetSunrise)
dropoff_sun_series = green_taxi_april["Lpep_dropoff_datetime"].map(relevantTimeToAprilSunsetSunrise)

# Unpack each pair into two columns; the loop order keeps the column order
# pickup_sunrise, pickup_sunset, dropoff_sunrise, dropoff_sunset.
for prefix, sun_pairs in (("pickup", pickup_sun_series), ("dropoff", dropoff_sun_series)):
    for position, suffix in enumerate(("sunrise", "sunset")):
        green_taxi_april[prefix + "_" + suffix] = sun_pairs.map(lambda pair: pair[position])

In [7]:
# Left disabled: the datetime columns are still needed for the split below.
# green_taxi_april = green_taxi_april.drop(labels = ["lpep_pickup_datetime", "Lpep_dropoff_datetime"], axis = 1)

# Save the complete enriched April frame.
green_taxi_april.to_pickle("data/allAprilEnriched")

# Save the rows whose drop-off day-of-month is 24 or later.
# NOTE(review): the test uses only the day-of-month, so a trip dropped off
# after midnight on the last day of April (day 1 of May) would NOT land in
# this file -- confirm whether such rows exist.
late_dropoff_mask = green_taxi_april["Lpep_dropoff_datetime"].map(lambda ts: ts.day) >= 24
green_taxi_april[late_dropoff_mask].to_pickle("data/lateAprilDataEnriched")

In [8]:
# Rows whose drop-off day-of-month is before the 24th form the pool that is
# later split into training and validation data.
# NOTE(review): only the day-of-month is compared, so a drop-off on an early
# day of the following month would also be selected here -- confirm whether
# such rows exist.
early_dropoff_mask = green_taxi_april["Lpep_dropoff_datetime"].map(lambda ts: ts.day) < 24
earlyAprilData = green_taxi_april[early_dropoff_mask]
len(earlyAprilData)

1159826

Since I'm going to work with a neural network, I want to provide it with as much training data as possible.  In my case, I'm only going to select 100000 rows as validation data, which is about 8.6% of the early April data, leaving me with over 1 million rows for training, which is about 91.4% of the early April data.

In [9]:
# Hold out a fixed-size validation sample; random_state makes it reproducible.
validationData = earlyAprilData.sample(100000, random_state = 56789)

# Training data is every early-April row that was not sampled for validation.
# The original referenced the undefined name `validation` here, which raises
# NameError on a fresh kernel.
trainingData = earlyAprilData.drop(labels = validationData.index.values, axis = 0)

# Rows are dropped by index label, so a non-unique index would remove more
# rows than were sampled; fail loudly instead of silently losing data.
assert len(trainingData) + len(validationData) == len(earlyAprilData), \
    "training/validation split lost rows (is the index unique?)"

trainingData.to_pickle("data/earlyAprilDataEnriched")
validationData.to_pickle("data/validationAprilDataEnriched")

I am curious to know how well a model built for April will be able to predict data for March, so I'm going to clean up, enrich, and save off March data as well.

In [10]:
# March daily weather table.
weather_march = pd.read_csv("data/March2016Weather.txt")

# Turn the amount columns into floats; the "T" marker becomes 0.00001.
for amount_column in ("Precipitation", "NewSnow"):
    weather_march[amount_column] = weather_march[amount_column].map(getFloatOfPrecipitation)

# Parse the date strings, then derive the month and day-of-month join keys.
weather_march["Date"] = weather_march["Date"].map(lambda raw: getDate(raw, "%Y-%m-%d"))
weather_march["Month"] = weather_march["Date"].map(lambda parsed: parsed.month)
weather_march["DayNum"] = weather_march["Date"].map(lambda parsed: parsed.day)

weather_march = weather_march.drop(
    columns=["HDD", "CDD", "Departure", "SnowDepth", "MaxTemp", "MinTemp", "Date"]
)

# Lookup keyed by row positions 3 and 4, holding row positions 0, 1 and 2.
# NOTE(review): this positional indexing assumes the remaining columns are
# exactly AvgTemp, Precipitation, NewSnow, Month, DayNum in that order --
# confirm against the CSV header.
weather_march_dict = {
    (row[3], row[4]): (row[0], row[1], row[2])
    for row in weather_march.values
}

# Day column plus the "3RIS"/"3SET" columns (presumably March sunrise/sunset)
# of the sun_data table loaded earlier in the notebook; rows with a missing
# value in any of the three are dropped.
march_sun_data = sun_data[["DA", "3RIS", "3SET"]].dropna()
march_sun_data["DA"] = march_sun_data["DA"].map(lambda raw: getDate(raw, "%d").day)
for clock_column in ("3RIS", "3SET"):
    # "%H%M" parsing yields datetimes on the default date 1900-01-01.
    march_sun_data[clock_column] = march_sun_data[clock_column].map(lambda raw: getDate(raw, "%H%M"))

# day-of-month -> (3RIS value, 3SET value)
sun_march_dict = {row[0]: (row[1], row[2]) for row in march_sun_data.values}

In [11]:
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by this project.
green_taxi_march = pd.read_pickle("data/allMarchData")

# Look up the weather tuple for each trip by the (month, day) of its pickup.
# A pickup date missing from weather_march_dict raises KeyError.
taxi_march_weather_join_series = green_taxi_march["lpep_pickup_datetime"].map(lambda x: weather_march_dict[(x.month, x.day)])
green_taxi_march["AvgTemp"] = taxi_march_weather_join_series.map(lambda x: x[0])
green_taxi_march["Precipitation"] = taxi_march_weather_join_series.map(lambda x: x[1])
green_taxi_march["NewSnow"] = taxi_march_weather_join_series.map(lambda x: x[2])

# Trip duration in seconds and in whole (rounded) minutes.
green_taxi_march["trip_length_seconds"] = green_taxi_march[["lpep_pickup_datetime" ,"Lpep_dropoff_datetime"]].apply(lambda x: calcSecondsBetween(*x), axis=1)
green_taxi_march["trip_length_minutes_rounded"] = round(green_taxi_march["trip_length_seconds"]/60)

# Keep trips longer than 30 seconds and at most 180 rounded minutes.
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below do not trigger SettingWithCopyWarning.
green_taxi_march = green_taxi_march[(green_taxi_march["trip_length_seconds"] > 30) & (green_taxi_march["trip_length_minutes_rounded"] <= 180)].copy()

# Calendar features of the pickup timestamp.
green_taxi_march["pickup_minute"] = green_taxi_march["lpep_pickup_datetime"].map(lambda x: x.minute)
green_taxi_march["pickup_hour"] = green_taxi_march["lpep_pickup_datetime"].map(lambda x: x.hour)
green_taxi_march["pickup_day"] = green_taxi_march["lpep_pickup_datetime"].map(lambda x: x.day)
green_taxi_march["pickup_weekday"] = green_taxi_march["lpep_pickup_datetime"].map(lambda x: x.weekday())

# Calendar features of the drop-off timestamp.
green_taxi_march["dropoff_minute"] = green_taxi_march["Lpep_dropoff_datetime"].map(lambda x: x.minute)
green_taxi_march["dropoff_hour"] = green_taxi_march["Lpep_dropoff_datetime"].map(lambda x: x.hour)
green_taxi_march["dropoff_day"] = green_taxi_march["Lpep_dropoff_datetime"].map(lambda x: x.day)
green_taxi_march["dropoff_weekday"] = green_taxi_march["Lpep_dropoff_datetime"].map(lambda x: x.weekday())

In [12]:
# Replace the flag column with its 0/1 encoding ("N" -> 0, anything else -> 1).
# This overwrites the column in place, so running the cell a second time maps
# the already-encoded values (none of which equal "N") to 1.
green_taxi_march["Store_and_fwd_flag"] = green_taxi_march["Store_and_fwd_flag"].map(mapStoreFwdFlag)

In [13]:
def relevantTimeToMarchSunsetSunrise(x):
    """Return (minutes_after_sunrise, minutes_before_sunset) for timestamp x.

    The pair is looked up in sun_march_dict by x.day, and x is reduced to its
    hour and minute (seconds are ignored) on the date 1900-01-01 so that it
    can be compared with the parsed sunrise/sunset times.

    * Before noon: if x is at or after sunrise and fewer than 90 minutes past
      it, the result is (minutes_since_sunrise, 0).
    * Noon or later: if x is at or before sunset and fewer than 90 minutes
      ahead of it, the result is (0, minutes_until_sunset).
    * In every other case the result is (0, 0).

    This function was previously defined under the name
    relevantTimeToAprilSunsetSunrise, which silently replaced the April
    version of that function even though it reads March data.  It now has
    its own name so re-running earlier cells cannot pick up the wrong month.
    """
    sunrise, sunset = sun_march_dict[x.day]
    clock_time = datetime(1900, 1, 1, x.hour, x.minute)
    is_morning = x.hour < 12

    # Signed distance to the relevant event: negative means "before sunrise"
    # in the morning or "after sunset" in the afternoon/evening.
    offset = (clock_time - sunrise) if is_morning else (sunset - clock_time)
    offset_minutes = offset.total_seconds() / 60.0

    if offset_minutes < 0 or offset_minutes >= 90:
        return (0, 0)
    return (offset_minutes, 0) if is_morning else (0, offset_minutes)

# (minutes_after_sunrise, minutes_before_sunset) pairs for both ends of each trip.
pickup_sun_series_march = green_taxi_march["lpep_pickup_datetime"].map(relevantTimeToMarchSunsetSunrise)
dropoff_sun_series_march = green_taxi_march["Lpep_dropoff_datetime"].map(relevantTimeToMarchSunsetSunrise)

# Unpack each pair into its own column.
green_taxi_march["pickup_sunrise"] = pickup_sun_series_march.map(lambda x: x[0])
green_taxi_march["pickup_sunset"] = pickup_sun_series_march.map(lambda x: x[1])
green_taxi_march["dropoff_sunrise"] = dropoff_sun_series_march.map(lambda x: x[0])
green_taxi_march["dropoff_sunset"] = dropoff_sun_series_march.map(lambda x: x[1])

In [14]:
# Disabled step: would drop the raw pickup/drop-off datetime columns before saving.
# green_taxi_march = green_taxi_march.drop(labels = ["lpep_pickup_datetime", "Lpep_dropoff_datetime"], axis = 1)

# Save the complete enriched March frame (datetime columns included).
green_taxi_march.to_pickle("data/allMarchEnriched")