In [24]:
import pandas as pd
import numpy as np
from uszipcode import SearchEngine
from uszipcode import Zipcode
from datetime import datetime, timedelta

In [6]:
def read_data_file(csv_path, row_count=None):
    """Load a CSV into a DataFrame, naming its columns from the header row.

    The header line is read on its own first to learn the column names. The
    file is then read again with that line skipped, keeping only as many
    leading fields per row as there are header names.
    NOTE(review): presumably this guards against data rows that carry more
    fields than the header -- confirm against the input files.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the comma-separated file.
    row_count : int, optional
        Maximum number of data rows to load. Any falsy value (``None`` or
        ``0``) loads the whole file.

    Returns
    -------
    pandas.DataFrame
        The loaded rows, with columns named after the header line.
    """
    header_names = pd.read_csv(csv_path, nrows=1).columns
    read_options = dict(
        header=None,
        sep=',',
        skiprows=1,
        usecols=list(range(len(header_names))),
        names=header_names,
        low_memory=False,
    )
    # Only cap the row count when a truthy limit was requested.
    if row_count:
        read_options['nrows'] = row_count
    return pd.read_csv(csv_path, **read_options)

In [7]:
# spark.read().csv("s3://")

In [8]:
def get_columns(df, col_type="relevant"):
    """Return the names of columns in ``df`` that match a group of substrings.

    Matching is case-insensitive. The result is ordered by substring first (in
    the order listed below) and by column position second. Each column is
    returned at most once, even if its name contains several of the
    substrings, so the result can be used directly as ``df[cols]`` without
    producing duplicated columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names are inspected.
    col_type : str
        ``"relevant"`` selects time, location, passenger, distance, ratecode,
        fare, longitude and latitude columns; ``"geolocation"`` selects
        location, longitude and latitude columns. Any other value selects
        nothing.

    Returns
    -------
    list
        Matching column names (empty for an unknown ``col_type``).
    """
    substrings_by_type = {
        "relevant": ['time', 'location', 'passenger', 'distance',
                     'ratecode', 'fare', "longitude", "latitude"],
        "geolocation": ["location", "longitude", "latitude"],
    }
    cols = []
    seen = set()
    for sub in substrings_by_type.get(col_type, ()):
        for col in df.columns:
            if sub.lower() in col.lower() and col not in seen:
                seen.add(col)
                cols.append(col)
    return cols
    
    

## Data Cleaning

### Getting rid of Null, NaN and Zero values

In [16]:
def remove_nan_values(df):
    """Drop rows that contain a missing value or a zero in any column.

    The literal string ``'None'`` is treated as missing. The frame's shape is
    printed after the missing-value pass and again after the zero pass.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A filtered frame; the original row labels are kept.
    """
    without_missing = df.replace(to_replace='None', value=np.nan).dropna()
    print(without_missing.shape)
    # A row survives only if every one of its cells differs from 0.
    row_has_no_zero = (without_missing != 0).all(axis=1)
    without_zeros = without_missing[row_has_no_zero]
    print(without_zeros.shape)
    return without_zeros

### Removing trips with invalid latitudes and longitudes

In [17]:
def remove_invalid_lat_long(df):
    """Keep only rows whose latitude/longitude columns lie inside fixed bounds.

    Columns are discovered with ``get_columns(df, "geolocation")``; those whose
    name contains "latitude" are checked against the latitude range and the
    remaining ones containing "longitude" against the longitude range (both
    ranges inclusive). The list of discovered columns is printed first, then
    the frame's shape after each column has been filtered.
    NOTE(review): the bounds look like a bounding box around New York City --
    confirm the source of these constants.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The rows that passed every coordinate check.
    """
    lat_bounds = (40.477399, 40.917577)
    long_bounds = (-74.259090, -73.700272)

    loc_cols = get_columns(df, "geolocation")
    print(loc_cols)

    lat_cols = [name for name in loc_cols if "latitude" in name.lower()]
    # A name already classified as latitude is never treated as longitude.
    long_cols = [name for name in loc_cols
                 if "latitude" not in name.lower() and "longitude" in name.lower()]

    # Latitude columns are filtered first, then longitude columns.
    for names, (lowest, highest) in ((lat_cols, lat_bounds), (long_cols, long_bounds)):
        for name in names:
            df = df.loc[(df[name] >= lowest) & (df[name] <= highest)]
            print(df.shape)
    return df


### Removing all trips with Fare_amount below the minimum NYC taxi fare, i.e., \$2.50

In [77]:
def remove_invalid_fare_trips(df, fare_col="Fare_amount", min_fare=2.5):
    """Drop trips whose fare is below the minimum fare.

    The filter is computed from the frame passed in, not from any notebook
    global, so the function works on any frame that has the fare column.
    The shape of the filtered frame is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips to filter.
    fare_col : str
        Name of the fare column (default ``"Fare_amount"``).
    min_fare : float
        Smallest fare that is kept (default ``2.5``).

    Returns
    -------
    pandas.DataFrame
        Rows with ``df[fare_col] >= min_fare``.

    Raises
    ------
    KeyError
        If ``fare_col`` is not a column of ``df``.
    """
    df = df.loc[df[fare_col] >= min_fare]
    print(df.shape)
    return df

In [80]:
def fix_column_datatypes(df, date_columns=None, numeric_columns=None):
    """Convert the named columns of ``df`` to datetime / numeric dtypes.

    The frame is modified in place and also returned. A group that is
    ``None`` or empty is skipped entirely, so calling the function without
    any column names is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert (mutated).
    date_columns : list of str, optional
        Columns passed through ``pd.to_datetime``.
    numeric_columns : list of str, optional
        Columns passed through ``pd.to_numeric``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # None defaults instead of shared mutable list defaults.
    if date_columns is not None and len(date_columns) > 0:
        df[date_columns] = df[date_columns].apply(pd.to_datetime)
    if numeric_columns is not None and len(numeric_columns) > 0:
        df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
    return df

## Feature Engineering

### Getting ZipCode from Latitude and Longitude

In [48]:
def get_zip_code(lat, long, search=None):
    """Look up the zip code record for a coordinate pair.

    Parameters
    ----------
    lat, long : float
        Coordinates handed to ``SearchEngine.by_coordinates``.
    search : SearchEngine, optional
        An existing engine to reuse. When omitted, a new
        ``SearchEngine(simple_zipcode=True)`` is created for this call, which
        is the original behaviour. Callers looking up many points can pass
        one engine to avoid rebuilding it for every row.

    Returns
    -------
    The first entry of the lookup result (requested with ``radius=5`` and
    ``returns=1``).

    Raises
    ------
    IndexError
        If the lookup result is empty.
        NOTE(review): assumes ``by_coordinates`` returns a list -- confirm
        against the installed uszipcode version.
    """
    if search is None:
        search = SearchEngine(simple_zipcode=True)
    result = search.by_coordinates(lat, long, radius=5, returns=1)
    return result[0]

In [78]:
def fill_zip_codes(df):
    """Add a ``'zipcode'`` column derived from each trip's pickup coordinates.

    Every row's ``'Pickup_latitude'`` / ``'Pickup_longitude'`` pair is passed
    to ``get_zip_code`` and the ``zipcode`` attribute of the result is stored.
    The column is assigned in one step rather than cell by cell, so it does
    not depend on the row labels being unique and is also created (empty)
    for an empty frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with ``'Pickup_latitude'`` and ``'Pickup_longitude'`` columns;
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with a ``'zipcode'`` column.
    """
    df['zipcode'] = [
        get_zip_code(lat, long).zipcode
        for lat, long in zip(df['Pickup_latitude'], df['Pickup_longitude'])
    ]
    return df

### Trip frequency by time slice within a particular zip code area

In [70]:
def get_trip_frequency(df, curr_row, time_col, hrs=1):
    """Count trips in the same zip code during the ``hrs`` hours before a trip.

    The window runs from ``curr_row[time_col] - hrs`` hours up to
    ``curr_row[time_col]``, with both ends included, so the current trip
    itself is counted when it is part of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trips with a ``time_col`` column and a ``'zipcode'`` column.
    curr_row : pandas.Series or mapping
        The reference trip; must provide ``time_col`` and ``'zipcode'``.
    time_col : str
        Name of the timestamp column used for the window.
    hrs : int or float
        Window length in hours (default 1).

    Returns
    -------
    int
        Number of rows of ``df`` inside the window with the same zip code.
    """
    end_time = curr_row[time_col]
    start_time = end_time - timedelta(hours=hrs)
    current_zip = curr_row['zipcode']
    # Explicit comparisons instead of Series.between(..., inclusive=True):
    # the boolean form of `inclusive` is not accepted by current pandas.
    in_window = (df[time_col] >= start_time) & (df[time_col] <= end_time)
    same_zip = df['zipcode'] == current_zip
    return int((in_window & same_zip).sum())

In [82]:
def fill_trip_frequencies(df, hrs=[]):
    """Placeholder: currently only prints "try" and returns None.

    Neither ``df`` nor ``hrs`` is used yet.
    TODO: implement; presumably meant to add trip-frequency columns for each
    window length in ``hrs`` -- confirm the intended behaviour.
    """
    print("try")
        

## Main

In [9]:
# Candidate input CSVs, given as paths relative to the working directory;
# a later cell picks one of them by index.
files = ["green_tripdata_2018-06.csv",
         "green_tripdata_2014-05.csv", 
         "yellow_tripdata_2018-06.csv",
         "fhv_tripdata_2018-06.csv"]

In [61]:
# Load files[1] ("green_tripdata_2014-05.csv") in full (no row_count limit).
df = read_data_file(files[1])

In [11]:
# Select the columns matched by get_columns' default "relevant" filter and show them.
cols = get_columns(df)
print(cols)

['lpep_pickup_datetime', 'Lpep_dropoff_datetime', 'Passenger_count', 'Trip_distance', 'RateCodeID', 'Fare_amount', 'Pickup_longitude', 'Dropoff_longitude', 'Pickup_latitude', 'Dropoff_latitude']


In [12]:
# Keep only the selected columns for the rest of the pipeline.
df1 = df[cols]

In [13]:
# Preview the first rows of the reduced frame.
df1.head()

Unnamed: 0,lpep_pickup_datetime,Lpep_dropoff_datetime,Passenger_count,Trip_distance,RateCodeID,Fare_amount,Pickup_longitude,Dropoff_longitude,Pickup_latitude,Dropoff_latitude
0,2014-05-01 00:00:00,2014-05-01 22:05:36,1,0.95,1,6.5,0.0,-73.977715,0.0,40.687542
1,2014-05-01 00:00:00,2014-05-01 07:52:17,1,1.95,1,9.0,0.0,0.0,0.0,0.0
2,2014-05-01 00:00:00,2014-05-01 10:50:16,1,5.65,1,26.5,0.0,0.0,0.0,0.0
3,2014-05-01 00:00:00,2014-05-01 20:50:04,1,6.88,1,24.0,0.0,0.0,0.0,0.0
4,2014-05-01 00:00:00,2014-05-01 10:35:50,1,4.46,1,21.0,0.0,0.0,0.0,0.0


In [None]:
# Cleaning pipeline: drop rows with missing/zero values, then rows with
# out-of-bounds coordinates, then rows whose fare is below the minimum.
df1 = remove_nan_values(df1)
df1 = remove_invalid_lat_long(df1)
df1 = remove_invalid_fare_trips(df1)

In [None]:
# Renumber the remaining rows from 0 and discard the old index labels.
df1 = df1.reset_index(drop=True)

In [None]:
# NOTE(review): both lists are empty, so no column is actually named for
# conversion here -- TODO fill in the intended date and numeric columns.
date_columns = []
numeric_columns = []
df1 = fix_column_datatypes(df1, date_columns, numeric_columns)

In [10]:
# Add a 'zipcode' column looked up from each trip's pickup coordinates.
df1 = fill_zip_codes(df1)