## Import packages

In [1]:
import sys
import boto3 
import pandas as pd
import numpy as np
import s3fs
import StringIO
from uszipcode import SearchEngine
from uszipcode import Zipcode
from datetime import datetime, timedelta

In [2]:
# Lookup table from an integer location ID (keys 1-265) to a ZIP code. It is
# indexed by get_loc_id_zipcode() with the pulocationid / dolocationid values.
# NOTE(review): the keys are presumably NYC TLC taxi-zone IDs -- confirm the
# source of this mapping.
# ZIP codes are stored as ints, so a leading zero cannot be represented
# (key 1 -> 7114 is presumably "07114" -- TODO confirm).
# IDs 264 and 265 map to an empty string instead of a ZIP code.
zipcodeMap = {1: 7114, 2: 11430, 3: 10469, 4: 10009, 5: 10309, 6: 10305, 7: 11101, 8: 11105, 9: 11358, 10: 11434, 11: 11214, 12: 10280, 13: 10280, 14: 11209, 15: 11359, 16: 11361, 17: 10506, 18: 10458, 19: 11426, 20: 14813, 21: 11206, 22: 11207, 23: 10314, 24: 12913, 25: 11201, 26: 11212, 27: 11697, 28: 11435, 29: 11235, 30: 11693, 31: 10453, 32: 10462, 33: 11201, 34: 11251, 35: 11212, 36: 11221, 37: 11237, 38: 11411, 39: 11234, 40: 11231, 41: 10026, 42: 10027, 43: 10019, 44: 10309, 45: 10013, 46: 10464, 47: 10457, 48: 10001, 49: 11205, 50: 10002, 51: 10475, 52: 11201, 53: 11356, 54: 11231, 55: 11224, 56: 11368, 57: 11368, 58: 10465, 59: 10457, 60: 10459, 61: 11238, 62: 11205, 63: 11208, 64: 11363, 65: 11201, 66: 11201, 67: 11228, 68: 10019, 69: 10451, 70: 11369, 71: 11203, 72: 11236, 73: 11355, 74: 10035, 75: 10029, 76: 10003, 77: 11207, 78: 10457, 79: 10211, 80: 11211, 81: 10466, 82: 11373, 83: 11378, 84: 10308, 85: 11226, 86: 11691, 87: 10005, 88: 10006, 89: 11226, 90: 10010, 91: 11239, 92: 11355, 93: 11368, 94: 10468, 95: 11375, 96: 11385, 97: 11205, 98: 11365, 99: 10312, 100: 10018, 101: 11004, 102: 11385, 103: 10012, 104: 10012, 105: 10012, 106: 11215, 107: 10016, 108: 11223, 109: 10308, 110: 10306, 111: 11232, 112: 11222, 113: 10012, 114: 10013, 115: 10301, 116: 10031, 117: 11692, 118: 10314, 119: 10452, 120: 10034, 121: 11366, 122: 11423, 123: 11229, 124: 11414, 125: 10006, 126: 10474, 127: 10034, 128: 10031, 129: 11372, 130: 11412, 131: 11423, 132: 11430, 133: 11218, 134: 11415, 135: 11367, 136: 10468, 137: 10016, 138: 11371, 139: 11413, 140: 10021, 141: 10065, 142: 10023, 143: 10024, 144: 10013, 145: 11101, 146: 11101, 147: 10459, 148: 10009, 149: 11201, 150: 11235, 151: 10025, 152: 10027, 153: 10463, 154: 11234, 155: 11234, 156: 10303, 157: 11378, 158: 10014, 159: 10456, 160: 11379, 161: 10018, 162: 10022, 163: 10018, 164: 10017, 165: 11230, 166: 10027, 167: 10456, 168: 10451, 169: 10453, 170: 10010, 171: 11354, 172: 10306, 173: 11368, 174: 10467, 175: 
11364, 176: 10306, 177: 11233, 178: 11230, 179: 11103, 180: 11416, 181: 11215, 182: 10462, 183: 10461, 184: 10461, 185: 10461, 186: 10016, 187: 10302, 188: 11225, 189: 11238, 190: 11215, 191: 11427, 192: 11355, 193: 11101, 194: 10035, 195: 11231, 196: 11374, 197: 11418, 198: 11385, 199: 11370, 200: 10471, 201: 11694, 202: 10044, 203: 11422, 204: 10309, 205: 11412, 206: 11378, 207: 11370, 208: 10469, 209: 10013, 210: 11235, 211: 10013, 212: 10472, 213: 10472, 214: 10305, 215: 11435, 216: 11420, 217: 11221, 218: 11413, 219: 11413, 220: 10463, 221: 10304, 222: 11239, 223: 11105, 224: 10009, 225: 11205, 226: 11104, 227: 11211, 228: 11232, 229: 10022, 230: 10036, 231: 10007, 232: 10002, 233: 10022, 234: 10003, 235: 10453, 236: 10021, 237: 10028, 238: 10044, 239: 10065, 240: 10463, 241: 10463, 242: 10461, 243: 10032, 244: 10034, 245: 10310, 246: 10036, 247: 10451, 248: 10460, 249: 10014, 250: 10462, 251: 10314, 252: 11357, 253: 11357, 254: 10467, 255: 10467, 256: 10467, 257: 11215, 258: 11421, 259: 10470, 260: 11377, 261: 10048, 262: 10028, 263: 10028, 264: '', 265: ''}

## Read file and extract a map of required columns

In [3]:
def readfile(filename, row_count=None):
    """Load a trip-data CSV and keep only the columns the pipeline uses.

    Columns are matched case-insensitively against a fixed list of useful
    names: a column is kept when its lower-cased name contains, or is
    contained in, one of them. Kept columns are renamed to the canonical name
    and every other column is dropped (the dropped names are printed).

    Args:
        filename: Path or URL accepted by ``pandas.read_csv``.
        row_count: Optional maximum number of data rows to read. A falsy
            value (``None`` or ``0``) reads the whole file.

    Returns:
        A DataFrame holding only the matched columns under canonical names.
    """
    # Example inputs:
    #   "s3n://nyc-tlc/trip data/green_tripdata_2018-06.csv"  (has area codes)
    #   "s3n://nyc-tlc/trip data/yellow_tripdata_2015-07.csv" (has lat/long data)

    # Read only the header first so the body can be parsed against an explicit
    # column list. Restricting usecols to the header width presumably guards
    # against data rows carrying more fields than the header -- TODO confirm.
    header = pd.read_csv(filename, nrows=1).columns
    df = pd.read_csv(
        filename, header=None, sep=',', skiprows=1,
        usecols=list(range(len(header))), names=header,
        low_memory=False,
        nrows=row_count if row_count else None
    )

    useful_columns = ["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "pickup_datetime", "dropoff_datetime", "dolocationid", "pulocationid", "fare_amount", "trip_distance", "passenger_count"]
    column_map = {}
    drop_columns = []
    # Walk the columns in file order so the printed drop list is stable.
    for c in df.columns:
        cl = c.lower()
        for u in useful_columns:
            # Substring match in either direction; when several useful names
            # match, the last one in useful_columns wins.
            if cl in u or u in cl:
                column_map[c] = u
        if c not in column_map:
            drop_columns.append(c)
    print(drop_columns)

    return df.drop(drop_columns, axis=1).rename(columns=column_map)

In [4]:
def read_data_file(csv_path, row_count=None):
    """Read a CSV into a DataFrame, using its header row for column names.

    Args:
        csv_path: Path or URL accepted by ``pandas.read_csv``.
        row_count: Optional cap on the number of data rows; a falsy value
            reads every row.

    Returns:
        The parsed DataFrame, with all header columns retained.
    """
    # The header is read separately and then skipped, so the body is parsed
    # against an explicit list of names limited to the header's width.
    header_names = pd.read_csv(csv_path, nrows=1).columns
    parse_options = dict(
        header=None,
        sep=',',
        skiprows=1,
        usecols=list(range(len(header_names))),
        names=header_names,
        low_memory=False,
    )
    if row_count:
        parse_options['nrows'] = row_count
    return pd.read_csv(csv_path, **parse_options)

In [5]:
def get_columns(df, col_type="relevant"):
    """Return the names of ``df`` columns that contain known substrings.

    Args:
        df: DataFrame whose column names are inspected.
        col_type: ``"relevant"`` selects time/location/passenger/distance/
            ratecode/fare/longitude/latitude columns; ``"geolocation"``
            selects location/longitude/latitude columns. Any other value
            yields an empty list.

    Returns:
        Matching column names, grouped by substring (in the order listed
        above) and, within a group, in DataFrame column order. A column that
        matches several substrings appears once per match.
    """
    substrings_by_type = {
        "relevant": ['time', 'location', 'passenger', 'distance',
                     'ratecode', 'fare', "longitude", "latitude"],
        "geolocation": ["location", "longitude", "latitude"],
    }
    column_names = list(df.columns)
    wanted = substrings_by_type.get(col_type, [])
    # Case-insensitive substring match, substring-major ordering.
    return [
        name
        for fragment in wanted
        for name in column_names
        if fragment.lower() in name.lower()
    ]

## Sanity check - all the required columns should be present

In [6]:
def sanityCheck(columns):
    """Check that ``columns`` holds everything preprocessing needs.

    The five trip columns are always required. Location must be available
    either as all four pickup/dropoff latitude/longitude columns, or as both
    ``dolocationid`` and ``pulocationid``.

    Args:
        columns: Container of canonical column names (the caller passes a set).

    Returns:
        True when the data can be preprocessed, False otherwise. A message
        describing the outcome is printed in both cases.
    """
    mandatory = ("pickup_datetime", "dropoff_datetime", "fare_amount", "trip_distance", "passenger_count")
    first_missing = next((name for name in mandatory if name not in columns), None)
    if first_missing is not None:
        print("Required column {} not found in the data. Exiting.".format(first_missing))
        return False

    coordinate_columns = ("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude")
    location_id_columns = ("dolocationid", "pulocationid")
    has_coordinates = all(name in columns for name in coordinate_columns)
    has_location_ids = all(name in columns for name in location_id_columns)
    # Location IDs are only needed as a fallback when coordinates are incomplete.
    if not has_coordinates and not has_location_ids:
        print("Required columns for location not found in the data. Exiting.")
        return False

    print("Sanity check complete. Data can be preprocessed.")
    return True


## Feature engineering - add zipcode

In [7]:
def get_zip_code(latitude,longitude):
    """Look up the ZIP code nearest to a coordinate.

    Args:
        latitude: Latitude passed to ``SearchEngine.by_coordinates``.
        longitude: Longitude passed to ``SearchEngine.by_coordinates``.

    Returns:
        The ``zipcode`` attribute of the first search result, or the fallback
        value ``10001`` when the lookup raises ``ValueError`` or finds nothing
        within the 5-unit search radius.
    """
    fallback_zipcode = 10001
    try:
        # NOTE(review): a new SearchEngine is built on every call; if this is
        # costly, consider sharing one instance per worker process.
        search = SearchEngine(simple_zipcode=True)
        result = search.by_coordinates(latitude, longitude, radius=5, returns=1)
    except ValueError:
        return fallback_zipcode
    # An empty result used to raise an uncaught IndexError on result[0].
    if not result:
        return fallback_zipcode
    return result[0].zipcode

In [8]:
def applyZipCode(lat, long):
    """Return the ZIP code for a latitude/longitude pair.

    Thin wrapper that forwards both arguments, in order, to ``get_zip_code``
    and returns its result unchanged.
    """
    return get_zip_code(lat, long)

In [9]:
def get_loc_id_zipcode(loc_id):
    """Translate a location ID into its ZIP code via ``zipcodeMap``.

    Raises:
        KeyError: If ``loc_id`` is not a key of ``zipcodeMap``.
    """
    zipcode = zipcodeMap[loc_id]
    return zipcode

In [10]:
def generate_zipcode_columns(df):
    """Add ``pickup_zipcode`` / ``dropoff_zipcode`` and drop their source columns.

    When ``pickup_latitude`` is present, ZIP codes are derived from the four
    coordinate columns via ``applyZipCode``. When ``dolocationid`` is present,
    they are derived from the two location-ID columns via
    ``get_loc_id_zipcode``. If both kinds are present, the location-ID values
    overwrite the coordinate-based ones.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with canonical column names.

    Returns:
        The same DataFrame, with the ZIP code columns added and the source
        columns removed.
    """
    # Work column-wise rather than with DataFrame.apply(axis=1): row-wise apply
    # returns a DataFrame (not a Series) for an empty frame, which breaks the
    # column assignment, and it builds a Series per row for no benefit.
    if 'pickup_latitude' in df.columns:
        df['pickup_zipcode'] = [
            applyZipCode(lat, lon)
            for lat, lon in zip(df['pickup_latitude'], df['pickup_longitude'])
        ]
        df['dropoff_zipcode'] = [
            applyZipCode(lat, lon)
            for lat, lon in zip(df['dropoff_latitude'], df['dropoff_longitude'])
        ]
        df.drop(["pickup_longitude","pickup_latitude", "dropoff_longitude", "dropoff_latitude"], axis=1, inplace=True)
    if 'dolocationid' in df.columns:
        df['pickup_zipcode'] = df['pulocationid'].apply(get_loc_id_zipcode)
        df['dropoff_zipcode'] = df['dolocationid'].apply(get_loc_id_zipcode)
        df.drop(["dolocationid", "pulocationid"], axis=1, inplace=True)
    return df

## Remove NaN values

In [11]:
def remove_nan_values(df):
    """Drop rows with missing values and rows containing any zero.

    The string ``'None'`` is treated as missing. After rows with missing
    values are dropped, every row in which at least one column equals 0 is
    removed as well. The frame's shape is printed after each of the two steps.

    Args:
        df: Input DataFrame (not modified).

    Returns:
        A filtered DataFrame that keeps the original index labels.
    """
    without_missing = df.replace(to_replace='None', value=np.nan)
    without_missing = without_missing.dropna()
    print(without_missing.shape)

    # Keep only rows where every column differs from zero.
    all_nonzero = without_missing.ne(0).all(axis=1)
    without_zeros = without_missing[all_nonzero]
    print(without_zeros.shape)
    return without_zeros

## Remove invalid latitude/longitude

In [12]:
def remove_invalid_lat_long(df):
    """Keep only rows whose coordinates fall inside a fixed bounding box.

    Latitudes must lie in [40.477399, 40.917577] and longitudes in
    [-74.259090, -73.700272], bounds inclusive. The list of location columns
    is printed first; the frame's shape is then printed after each column is
    filtered (latitude columns first, then longitude columns).

    Args:
        df: DataFrame with the four pickup/dropoff coordinate columns.

    Returns:
        The filtered DataFrame.
    """
    latitude_range = (40.477399, 40.917577)
    longitude_range = (-74.259090, -73.700272)
    loc_cols = ["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]
    print(loc_cols)

    latitude_names = [name for name in loc_cols if "latitude" in name.lower()]
    longitude_names = [
        name for name in loc_cols
        if "latitude" not in name.lower() and "longitude" in name.lower()
    ]

    for names, (lower, upper) in ((latitude_names, latitude_range),
                                  (longitude_names, longitude_range)):
        for name in names:
            # between() is inclusive at both ends, matching >= lower and <= upper.
            df = df.loc[df[name].between(lower, upper)]
            print(df.shape)

    return df

## Remove trips with invalid fare

In [13]:
def remove_invalid_fare_trips(df):
    """Return only the rows whose ``fare_amount`` is at least 2.5.

    Rows with a missing fare are dropped too, because the comparison is
    False for NaN.
    """
    minimum_fare = 2.5
    fare_is_valid = df['fare_amount'] >= minimum_fare
    return df.loc[fare_is_valid]

## Fix Column Datatypes

In [14]:
def fix_column_datatypes(df):
    """Convert the date and numeric columns of ``df`` to proper dtypes.

    ``pickup_datetime`` and ``dropoff_datetime`` go through
    ``pd.to_datetime``; the passenger, distance, fare and ZIP code columns go
    through ``pd.to_numeric``. The frame is modified in place and returned.
    """
    conversions = (
        (pd.to_datetime, ['pickup_datetime', 'dropoff_datetime']),
        (pd.to_numeric, ['passenger_count', 'trip_distance', 'fare_amount',
                         'pickup_zipcode', 'dropoff_zipcode']),
    )
    # Dates are converted before the numeric columns.
    for converter, names in conversions:
        df[names] = df[names].apply(converter)
    return df

## Generate Datetime Features

In [15]:
def generate_datetime_features(df):
    """Add calendar features derived from ``pickup_datetime``.

    Adds ``pickup_day``, ``pickup_hour``, ``pickup_day_of_week`` (Monday = 0),
    ``pickup_month`` and ``pickup_year``. Only pick-up features are generated.
    The frame is modified in place and returned.

    Args:
        df: DataFrame with a ``pickup_datetime`` column holding datetimes or
            values that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame with the five feature columns added.
    """
    # Use the vectorised .dt accessor instead of a Python lambda per value.
    # to_datetime leaves datetime columns as they are and lets string/object
    # columns work too.
    pickup = pd.to_datetime(df['pickup_datetime']).dt
    df['pickup_day'] = pickup.day
    df['pickup_hour'] = pickup.hour
    df['pickup_day_of_week'] = pickup.weekday
    df['pickup_month'] = pickup.month
    df['pickup_year'] = pickup.year
    return df

## Write DF back to S3

In [16]:
def writeDFtoS3(bucket, key, df):
    """Upload ``df`` to S3 as a CSV object without the index column.

    Args:
        bucket: Name of the destination S3 bucket.
        key: Object key to write.
        df: DataFrame to serialise.
    """
    # to_csv() returns the CSV text when no target is given, so no
    # intermediate buffer (and no Python-2-only StringIO module) is needed.
    csv_text = df.to_csv(index=False)

    # Upload CSV to S3
    s3 = boto3.client("s3")
    s3.put_object(Bucket=bucket, Key=key, Body=csv_text)

## Handler function for data preprocessing

In [17]:
def handler(filenames):
    """Preprocess each trip-data file and write the result to the output bucket.

    For every key in ``filenames`` the file is read from the ``nyc-tlc``
    bucket, validated, cleaned, enriched with ZIP code, datetime and
    ``trip_time`` (whole seconds) columns, and written under the same key to
    the ``dic-taxi-fare-prediction`` bucket. A file that fails the sanity
    check or raises during processing is reported and skipped, so one bad
    file does not abort the rest.

    Args:
        filenames: Iterable of object keys inside the input bucket.

    Returns:
        A single-element list holding the number of files written.
    """
    inputBucket = 'nyc-tlc'
    outputBucket = 'dic-taxi-fare-prediction'
    count = 0
    for filename in filenames:
        try:
            df = readfile("s3n://{}/{}".format(inputBucket, filename))
            if not sanityCheck(set(df.columns)):
                # Required columns are missing: do not attempt to process.
                print("Sanity check failed")
                continue
            print("Sanity check succeeded")
            df = remove_nan_values(df)
            if "pickup_longitude" in df.columns:
                df = remove_invalid_lat_long(df)
            df = remove_invalid_fare_trips(df)
            df = generate_zipcode_columns(df)
            df = fix_column_datatypes(df)
            df = generate_datetime_features(df)
            # total_seconds() keeps whole days in the duration; the timedelta
            # `.seconds` attribute silently drops them.
            trip_duration = df['dropoff_datetime'] - df['pickup_datetime']
            df['trip_time'] = trip_duration.dt.total_seconds().astype('int64')
            df = df.reset_index(drop=True)
            writeDFtoS3(outputBucket, filename, df)
            count += 1
        except Exception as exc:  # best effort until the lat/long logic is fixed
            # Report the failure instead of hiding it, then move on.
            print("Skipping {}: {!r}".format(filename, exc))
            continue

    return [count]

## Read file names to be processed

In [None]:
def getFilenames():
    """List the green-taxi trip-data object keys in the ``nyc-tlc`` bucket.

    Every page of the listing under the ``trip data`` prefix is read, so
    results are not truncated at the 1000 keys a single list call returns.

    Returns:
        A list of object keys containing ``green_tripdata``. The list is empty
        when nothing matches or when any page reports a non-200 status.
    """
    bucket = 'nyc-tlc'
    prefix = "trip data"
    trip_type = "green_tripdata"

    s3 = boto3.client("s3")
    tripdata_files = []

    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        metadata = page.get("ResponseMetadata") or {}
        if metadata.get("HTTPStatusCode") != 200:
            print("Could not read files under s3://{}/{}. Exiting.".format(bucket, prefix))
            return []
        # "Contents" is absent from a page that holds no objects.
        for obj in page.get("Contents", []):
            if "Key" in obj and trip_type in obj["Key"]:
                tripdata_files.append(obj["Key"])

    print("Found {} under path s3://{}/{} for {}".format(len(tripdata_files), bucket, prefix, trip_type))
    return tripdata_files

## Parallelize file processing 

In [None]:
tripdata_files = getFilenames()
# 16 = total number of worker cores in the cluster
NUM_PARTITIONS = 16
rdd = sc.parallelize(tripdata_files, NUM_PARTITIONS)
# handler returns one [count] per partition, so sum() gives the number of
# files written. count() would only report how many partitions ran.
rdd.mapPartitions(handler).sum()
# rdd.getNumPartitions()
# dir(rdd)
# print("Default parallelism: {}".format(sc.defaultParallelism))
# print("Number of partitions: {}".format(rdd.getNumPartitions()))
# print("Partitioner: {}".format(rdd.partitioner))
# print("Partitions structure: {}".format(rdd.glom().collect()))
