In [145]:
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sn
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [146]:
df_trips = pd.read_csv('trips_1.5M.csv')
df_trips.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount
0,1,2015-06-03 10:41:50,2015-06-03 11:08:33,1,2.8,-73.952888,40.776814,1,N,-73.979034,40.756611,1,17.5,0.0,0.5,4.57,0.0,22.87
1,2,2015-02-14 19:38:55,2015-02-14 20:00:59,1,2.79,-73.995827,40.725353,1,N,-73.984314,40.757389,2,15.0,0.0,0.5,0.0,0.0,15.8
2,2,2015-04-21 15:21:04,2015-04-21 15:33:38,1,1.75,-73.95192,40.769421,1,N,-73.956421,40.78714,2,9.5,0.0,0.5,0.0,0.0,10.3
3,2,2015-01-29 18:58:54,2015-01-29 19:04:46,3,0.83,-74.009026,40.71571,1,N,-74.003418,40.723152,1,5.5,1.0,0.5,1.46,0.0,8.76
4,1,2015-06-05 10:47:29,2015-06-05 10:58:00,3,0.8,-73.985909,40.756176,1,N,-73.988358,40.747398,2,7.0,0.0,0.5,0.0,0.0,7.8


In [147]:
df_trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500000 entries, 0 to 1499999
Data columns (total 18 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   VendorID               1500000 non-null  int64  
 1   tpep_pickup_datetime   1500000 non-null  object 
 2   tpep_dropoff_datetime  1500000 non-null  object 
 3   passenger_count        1500000 non-null  int64  
 4   trip_distance          1500000 non-null  float64
 5   pickup_longitude       1500000 non-null  float64
 6   pickup_latitude        1500000 non-null  float64
 7   RateCodeID             1500000 non-null  int64  
 8   store_and_fwd_flag     1500000 non-null  object 
 9   dropoff_longitude      1500000 non-null  float64
 10  dropoff_latitude       1500000 non-null  float64
 11  payment_type           1500000 non-null  int64  
 12  fare_amount            1500000 non-null  float64
 13  extra                  1500000 non-null  float64
 14  mta_tax           

In [148]:
# To reduce memory usage, we will limit the datatypes of the columns
df_trips[['VendorID', 'passenger_count', 'RateCodeID', 'payment_type']] = df_trips[['VendorID', 'passenger_count', 'RateCodeID', 'payment_type']].astype('int8')
df_trips[['fare_amount','dropoff_longitude', 'dropoff_latitude', 'pickup_longitude', 'pickup_latitude', 'extra', 'mta_tax', 'tip_amount','tolls_amount' ]] = df_trips[['fare_amount','dropoff_longitude', 'dropoff_latitude', 'pickup_longitude', 'pickup_latitude', 'extra', 'mta_tax', 'tip_amount','tolls_amount']].astype('float16')
df_trips['total_amount'] = df_trips['total_amount'].astype('float32')
df_trips['trip_distance'] = df_trips['trip_distance'].astype('float16')
df_trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500000 entries, 0 to 1499999
Data columns (total 18 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   VendorID               1500000 non-null  int8   
 1   tpep_pickup_datetime   1500000 non-null  object 
 2   tpep_dropoff_datetime  1500000 non-null  object 
 3   passenger_count        1500000 non-null  int8   
 4   trip_distance          1500000 non-null  float16
 5   pickup_longitude       1500000 non-null  float16
 6   pickup_latitude        1500000 non-null  float16
 7   RateCodeID             1500000 non-null  int8   
 8   store_and_fwd_flag     1500000 non-null  object 
 9   dropoff_longitude      1500000 non-null  float16
 10  dropoff_latitude       1500000 non-null  float16
 11  payment_type           1500000 non-null  int8   
 12  fare_amount            1500000 non-null  float16
 13  extra                  1500000 non-null  float16
 14  mta_tax           

In [149]:
# To predict the values, we will need to use the trip distances, which we currently do not have
# We can do this based on the two datetime columns

# First, turning the two columns into datetime objects
df_trips["tpep_pickup_datetime"] = pd.to_datetime(df_trips["tpep_pickup_datetime"])
df_trips["tpep_dropoff_datetime"] = pd.to_datetime(df_trips["tpep_dropoff_datetime"])

# Then, we can calculate the time length of the trip
df_trips['time_length'] = (df_trips["tpep_dropoff_datetime"] - df_trips["tpep_pickup_datetime"]).dt.total_seconds() / 3600
df_trips = df_trips.drop(columns=['tpep_dropoff_datetime', 'tpep_pickup_datetime', 'store_and_fwd_flag'])

df_trips

Unnamed: 0,VendorID,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount,time_length
0,1,1,2.800781,-73.9375,40.78125,1,-74.0000,40.75000,1,17.5,0.0,0.5,4.570312,0.0,22.870001,0.445278
1,2,1,2.789062,-74.0000,40.71875,1,-74.0000,40.75000,2,15.0,0.0,0.5,0.000000,0.0,15.800000,0.367778
2,2,1,1.750000,-73.9375,40.78125,1,-73.9375,40.78125,2,9.5,0.0,0.5,0.000000,0.0,10.300000,0.209444
3,2,3,0.830078,-74.0000,40.71875,1,-74.0000,40.71875,1,5.5,1.0,0.5,1.459961,0.0,8.760000,0.097778
4,1,3,0.799805,-74.0000,40.75000,1,-74.0000,40.75000,2,7.0,0.0,0.5,0.000000,0.0,7.800000,0.175278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1499995,1,1,0.700195,-74.0000,40.75000,1,-74.0000,40.75000,1,4.5,0.0,0.5,1.549805,0.0,6.850000,0.050278
1499996,1,1,0.799805,-74.0000,40.75000,1,-74.0000,40.75000,1,4.5,0.5,0.5,1.000000,0.0,6.800000,0.036944
1499997,1,1,2.000000,-73.9375,40.78125,1,-74.0000,40.75000,1,9.5,0.0,0.5,2.050781,0.0,12.350000,0.164722
1499998,2,1,1.959961,-74.0000,40.71875,1,-74.0000,40.75000,1,12.0,1.0,0.5,2.759766,0.0,16.559999,0.271111


In [150]:
# To clean the data, we will remove rows with missing values and outliers by filtering the data on specific conditions (explained in the report)
print(f"Dataframe shape before cleaning: {df_trips.shape}")

df_trips_filtered = df_trips.loc[
    (df_trips['trip_distance'] < 21.0) & 
    (df_trips['trip_distance'] > 0.2) & 
    (df_trips['dropoff_longitude'] <= -73) & 
    (df_trips['dropoff_longitude'] > -75) & 
    (df_trips['dropoff_latitude'] >= 40) & 
    (df_trips['dropoff_latitude'] < 42) &
    (df_trips['pickup_longitude'] <= -73) & 
    (df_trips['pickup_longitude'] > -75) & 
    (df_trips['pickup_latitude'] >= 40) & 
    (df_trips['pickup_latitude'] < 42) &
    (df_trips['time_length'] > 0.02)
]

print(f"Dataframe shape after cleaning: {df_trips_filtered.shape}")

Dataframe shape before cleaning: (1500000, 16)
Dataframe shape after cleaning: (1450228, 16)


In [216]:
labels =df_trips_filtered['trip_distance']
inputs = df_trips_filtered[['pickup_longitude', 'pickup_latitude','dropoff_longitude', 'dropoff_latitude']]
x_train, x_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.3, random_state=69)

In [217]:
clf_rf = RandomForestRegressor(n_estimators=100, criterion='friedman_mse', n_jobs=-1, random_state=42, min_samples_leaf = 5)

#clf_rf = LinearRegression()

clf_rf.fit(x_train, y_train)


In [218]:
predictions_lr = clf_rf.predict(x_test)

In [219]:
print(x_test.count())

pickup_longitude     435069
pickup_latitude      435069
dropoff_longitude    435069
dropoff_latitude     435069
dtype: int64


In [220]:
nancount = np.isnan(predictions_lr).sum()
nancount

0

In [221]:
mae_lr = mean_absolute_error(predictions_lr, y_test)

print('random forest regression mae : ', mae_lr)

random forest regression mae :  0.7828358452082579
