In [123]:
import pandas as pd
import numpy as np
from uszipcode import SearchEngine
from uszipcode import Zipcode

In [124]:
def read_data_file(csv_path, row_count=None):
    """Load a CSV into a DataFrame, keeping only the columns named in its header.

    The header line is parsed first to learn the column names and their count.
    The data rows are then read with ``usecols`` restricted to that many
    leading fields and labelled with the header names.

    Parameters
    ----------
    csv_path : str or path-like
        Location of the comma-separated file.
    row_count : int, optional
        Maximum number of data rows to load. ``None`` (the default) loads
        every row; ``0`` returns an empty frame that still carries the header
        columns.

    Returns
    -------
    pandas.DataFrame
        The loaded rows, one column per header name.
    """
    # nrows=0 parses only the header line, which is all we need here.
    column_names = pd.read_csv(csv_path, nrows=0).columns

    # nrows=None is pandas' own "read everything" value, so row_count can be
    # forwarded unchanged. A truthiness test would wrongly treat 0 as "no limit".
    return pd.read_csv(
        csv_path,
        header=None,
        sep=',',
        skiprows=1,  # skip the header line; names are supplied explicitly
        usecols=list(range(len(column_names))),
        names=column_names,
        low_memory=False,
        nrows=row_count,
    )

In [61]:
def get_columns(df, col_type="relevant"):
    """Select column labels of ``df`` by case-insensitive keyword match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are searched.
    col_type : str
        ``"relevant"`` matches the keywords time, location, passenger,
        distance, ratecode, fare, longitude and latitude.
        ``"geolocation"`` matches location, longitude and latitude.
        Any other value matches nothing.

    Returns
    -------
    list
        Matching labels, ordered by keyword first and by column position
        second. Each label appears at most once, even when it contains more
        than one keyword.
    """
    keywords_by_type = {
        "relevant": ['time', 'location', 'passenger', 'distance',
                     'ratecode', 'fare', "longitude", "latitude"],
        "geolocation": ["location", "longitude", "latitude"],
    }

    matched = []
    seen = set()
    for keyword in keywords_by_type.get(col_type, []):
        for col in df.columns:
            if col in seen:
                # Already selected by an earlier keyword; adding it again
                # would duplicate the column in df[cols].
                continue
            # str() keeps non-string labels (e.g. integers) from raising.
            if keyword in str(col).lower():
                matched.append(col)
                seen.add(col)
    return matched
    
    

In [62]:
# Trip-record CSV files used by this notebook (names are relative paths).
files = [
    "green_tripdata_2018-06.csv",
    "green_tripdata_2014-05.csv",
    "yellow_tripdata_2018-06.csv",
    "fhv_tripdata_2018-06.csv",
]

In [63]:
# Load files[1] ("green_tripdata_2014-05.csv") in full; no row_count limit is passed.
df = read_data_file(files[1])

In [64]:
# Default col_type ("relevant"): pick the columns whose names contain one of
# get_columns' keywords, then show which ones were selected.
cols = get_columns(df)
print(cols)

['lpep_pickup_datetime', 'Lpep_dropoff_datetime', 'Passenger_count', 'Trip_distance', 'RateCodeID', 'Fare_amount', 'Pickup_longitude', 'Dropoff_longitude', 'Pickup_latitude', 'Dropoff_latitude']


In [69]:
# Working frame: only the selected columns of the loaded data.
df1 = df[cols]

In [70]:
# Preview the first rows of the column subset.
df1.head()

Unnamed: 0,lpep_pickup_datetime,Lpep_dropoff_datetime,Passenger_count,Trip_distance,RateCodeID,Fare_amount,Pickup_longitude,Dropoff_longitude,Pickup_latitude,Dropoff_latitude
0,2014-05-01 00:00:00,2014-05-01 22:05:36,1,0.95,1,6.5,0.0,-73.977715,0.0,40.687542
1,2014-05-01 00:00:00,2014-05-01 07:52:17,1,1.95,1,9.0,0.0,0.0,0.0,0.0
2,2014-05-01 00:00:00,2014-05-01 10:50:16,1,5.65,1,26.5,0.0,0.0,0.0,0.0
3,2014-05-01 00:00:00,2014-05-01 20:50:04,1,6.88,1,24.0,0.0,0.0,0.0,0.0
4,2014-05-01 00:00:00,2014-05-01 10:35:50,1,4.46,1,21.0,0.0,0.0,0.0,0.0


In [71]:
# (rows, columns) before any cleaning is applied.
df1.shape

(1421503, 10)

## Data Cleaning

### Getting rid of Null, NaN and Zero values

In [74]:
# Treat the literal string 'None' as missing, then drop every row that has
# a missing value in any column.
df1 = df1.replace(to_replace='None', value=np.nan)
df1 = df1.dropna()
print(df1.shape)

# Drop every row in which at least one column equals zero.
df1 = df1[~(df1 == 0).any(axis=1)]
print(df1.shape)

(1421503, 10)
(1394778, 10)


### Removing trips with invalid latitudes and longitudes

In [80]:
# Inclusive latitude / longitude limits; rows outside them are dropped below.
max_lat = 40.917577
min_lat = 40.477399
max_long = -73.700272
min_long = -74.259090

loc_cols = get_columns(df1, "geolocation")
print(loc_cols)

# Split the geolocation columns by name. A name containing "latitude" is
# always treated as a latitude column, even if it also contains "longitude".
lat_cols = [name for name in loc_cols if "latitude" in name.lower()]
long_cols = [name for name in loc_cols
             if "latitude" not in name.lower() and "longitude" in name.lower()]

# Latitude columns are filtered first, then longitude columns; the shape is
# printed after each individual column filter.
for lower, upper, group in ((min_lat, max_lat, lat_cols),
                            (min_long, max_long, long_cols)):
    for col in group:
        # between() is inclusive on both ends by default.
        df1 = df1.loc[df1[col].between(lower, upper)]
        print(df1.shape)


['Pickup_longitude', 'Dropoff_longitude', 'Pickup_latitude', 'Dropoff_latitude']
(1392741, 10)
(1392741, 10)
(1392741, 10)
(1392741, 10)


In [81]:
# Preview the rows that remain after the coordinate filters.
df1.head()

Unnamed: 0,lpep_pickup_datetime,Lpep_dropoff_datetime,Passenger_count,Trip_distance,RateCodeID,Fare_amount,Pickup_longitude,Dropoff_longitude,Pickup_latitude,Dropoff_latitude
16,2014-05-01 00:00:03,2014-05-01 00:14:13,1,4.5,1,15.0,-73.952393,-73.999512,40.694527,40.687595
18,2014-05-01 00:00:07,2014-05-01 00:03:55,1,0.73,1,5.0,-73.960716,-73.953217,40.807095,40.812603
19,2014-05-01 00:00:08,2014-05-01 00:08:28,2,2.36,1,9.0,-73.979698,-73.988487,40.682358,40.695953
20,2014-05-01 00:00:10,2014-05-01 00:32:50,1,5.7,1,25.5,-73.942085,-73.991982,40.716255,40.74918
21,2014-05-01 00:00:10,2014-05-01 00:04:46,1,0.91,1,5.5,-73.990845,-74.001717,40.692108,40.687016


### Removing all trips with Fare_amount below the minimum NYC taxi fare, i.e., \$2.50

In [82]:
# Keep only trips whose fare is at least 2.5.
df1 = df1[df1["Fare_amount"].ge(2.5)]
print(df1.shape)

(1392503, 10)


In [93]:
# Renumber the surviving rows from 0 (the old index is discarded, not kept
# as a column), then display summary statistics.
df1 = df1.reset_index(drop=True)
df1.describe()

Unnamed: 0,Passenger_count,Trip_distance,RateCodeID,Fare_amount,Pickup_longitude,Dropoff_longitude,Pickup_latitude,Dropoff_latitude
count,1392503.0,1392503.0,1392503.0,1392503.0,1392503.0,1392503.0,1392503.0,1392503.0
mean,1.449106,3.102241,1.056373,12.85475,-73.93291,-73.93324,40.75702,40.75392
std,1.162148,2.98046,0.4578227,9.549095,0.04174045,0.05047115,0.0575651,0.05777015
min,1.0,0.01,1.0,2.5,-74.24041,-74.2586,40.53865,40.51074
25%,1.0,1.17,1.0,6.5,-73.95829,-73.96766,40.71186,40.71245
50%,1.0,2.12,1.0,10.0,-73.94324,-73.94376,40.7517,40.75361
75%,1.0,3.99,1.0,16.0,-73.91342,-73.90691,40.80578,40.79868
max,9.0,305.67,6.0,900.0,-73.70093,-73.70034,40.91718,40.91757


## Feature Engineering

### Getting ZipCode from Latitude and Longitude

In [99]:
# Shared SearchEngine instance, created on first use by get_zip_code.
_ZIP_SEARCH_ENGINE = None


def get_zip_code(lat,long):
    """Return the first zipcode record found near a coordinate.

    Parameters
    ----------
    lat, long : float
        Coordinates passed to ``SearchEngine.by_coordinates``.

    Returns
    -------
    The first element of the search result (``returns=1`` asks for a single
    match within ``radius=5``).
    NOTE(review): radius units are defined by uszipcode — confirm.

    Raises
    ------
    IndexError
        If the search returns no match.
    """
    global _ZIP_SEARCH_ENGINE
    if _ZIP_SEARCH_ENGINE is None:
        # Build the engine once and reuse it; constructing a new one for
        # every lookup is needlessly expensive when called per row.
        _ZIP_SEARCH_ENGINE = SearchEngine(simple_zipcode=True)

    result = _ZIP_SEARCH_ENGINE.by_coordinates(lat, long, radius=5, returns=1)
    if not result:
        # Same exception type as indexing an empty result, but with context.
        raise IndexError(
            "no zipcode found within radius 5 of ({}, {})".format(lat, long)
        )
    return result[0]

### Getting Historical Weather Information from Zipcode and Timestamp

In [95]:
def get_weather_from_that_day(zipcode, city, timestamp):
    """Placeholder for a historical-weather lookup.

    Currently it only prints ``zipcode``, ``city`` and ``timestamp`` and
    returns ``None``; no weather data is fetched yet.
    """
    print(zipcode, city, timestamp)
    # TODO: find a free weather API
    # TODO: fetch the historical weather data


### Trip Frequency by time slices in a particular Zipcode radius 

In [96]:
def get_trip_frequency(df, curr_row, hrs=1):
    """Print the pickup and dropoff coordinates of ``curr_row``.

    Only the coordinate report is implemented so far; ``df`` and ``hrs`` are
    accepted but not used yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Currently unused.
    curr_row : mapping or pandas.Series
        Must provide 'Pickup_latitude', 'Pickup_longitude',
        'Dropoff_latitude' and 'Dropoff_longitude'.
    hrs : int
        Currently unused.

    Returns
    -------
    None
    """
    # Read from the curr_row argument, not from a global `row` that happens
    # to be left over from some earlier loop.
    plat = curr_row['Pickup_latitude']
    plong = curr_row['Pickup_longitude']
    dlat = curr_row['Dropoff_latitude']
    dlong = curr_row['Dropoff_longitude']
    print("Pickup Lat : {} - Long: {} ---- Dropoff Lat : {} - Long: {}".format(plat,plong,dlat,dlong))

In [112]:
# Add placeholder columns in place: NaN for 'zipcode', empty string for 'city'.
df1["zipcode"] = np.nan
df1["city"] = ""

In [136]:
# NOTE(review): disabled per-row zipcode/city lookup. A later cell counts the
# rows whose 'city' is non-empty and reports a non-zero total, so this loop
# appears to have been run (at least partially) in an earlier session. On a
# fresh kernel it does not run, and 'zipcode'/'city' keep their placeholders.
# zip_codes = []
# cities = []
# for index, row in df1.iterrows():
#     lat = row['Pickup_latitude']
#     long = row['Pickup_longitude']
#     zipcode = get_zip_code(lat,long)
    
#     df1.at[index,'zipcode'] = zipcode.zipcode
#     df1.at[index,'city'] = zipcode.city
    

In [122]:
# Shape of the subset whose 'city' is no longer the empty-string placeholder.
df1[df1['city'] != ""].shape

(254283, 12)

Unnamed: 0,lpep_pickup_datetime,Lpep_dropoff_datetime,Passenger_count,Trip_distance,RateCodeID,Fare_amount,Pickup_longitude,Dropoff_longitude,Pickup_latitude,Dropoff_latitude
16,2014-05-01 00:00:03,2014-05-01 00:14:13,1,4.5,1,15.0,-73.952393,-73.999512,40.694527,40.687595
18,2014-05-01 00:00:07,2014-05-01 00:03:55,1,0.73,1,5.0,-73.960716,-73.953217,40.807095,40.812603
19,2014-05-01 00:00:08,2014-05-01 00:08:28,2,2.36,1,9.0,-73.979698,-73.988487,40.682358,40.695953
20,2014-05-01 00:00:10,2014-05-01 00:32:50,1,5.7,1,25.5,-73.942085,-73.991982,40.716255,40.74918
21,2014-05-01 00:00:10,2014-05-01 00:04:46,1,0.91,1,5.5,-73.990845,-74.001717,40.692108,40.687016


In [92]:
# NOTE(review): `df2` is not assigned in any visible cell, so this raises
# NameError on a fresh kernel — confirm where it comes from or remove the cell.
df2.head()

Unnamed: 0,lpep_pickup_datetime,Lpep_dropoff_datetime,Passenger_count,Trip_distance,RateCodeID,Fare_amount,Pickup_longitude,Dropoff_longitude,Pickup_latitude,Dropoff_latitude
0,2014-05-01 00:00:03,2014-05-01 00:14:13,1,4.5,1,15.0,-73.952393,-73.999512,40.694527,40.687595
1,2014-05-01 00:00:07,2014-05-01 00:03:55,1,0.73,1,5.0,-73.960716,-73.953217,40.807095,40.812603
2,2014-05-01 00:00:08,2014-05-01 00:08:28,2,2.36,1,9.0,-73.979698,-73.988487,40.682358,40.695953
3,2014-05-01 00:00:10,2014-05-01 00:32:50,1,5.7,1,25.5,-73.942085,-73.991982,40.716255,40.74918
4,2014-05-01 00:00:10,2014-05-01 00:04:46,1,0.91,1,5.5,-73.990845,-74.001717,40.692108,40.687016


In [135]:
# Pull the pickup coordinates and pickup timestamp of the first row.
first_trip = df1.iloc[0]
long = first_trip['Pickup_longitude']
lat = first_trip['Pickup_latitude']
date = first_trip['lpep_pickup_datetime']
print(lat, long, date)

40.69452667236328 -73.952392578125 2014-05-01 00:00:03
