In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [None]:
# Load the trip records, parsing both timestamp columns as datetimes.
df = pd.read_csv(
    "nyc_taxi_trip_duration.csv",
    parse_dates=["pickup_datetime", "dropoff_datetime"],
)


In [13]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,is_weekend,distance_km
0,id1080784,1,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,0,400,2,29,0,16,40,0,1.199073
1,id0889885,0,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,0,1100,3,11,4,23,35,0,4.129111
2,id0857912,1,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,0,1635,2,21,6,17,59,1,7.250753
3,id3744273,1,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,0,1141,1,5,1,9,44,0,2.361097
4,id0232939,0,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,0,848,2,17,2,6,42,0,4.328534


In [14]:
# Print the index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 725381 entries, 0 to 729321
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  725381 non-null  object        
 1   vendor_id           725381 non-null  int8          
 2   pickup_datetime     725381 non-null  datetime64[ns]
 3   dropoff_datetime    725381 non-null  datetime64[ns]
 4   passenger_count     725381 non-null  int64         
 5   pickup_longitude    725381 non-null  float64       
 6   pickup_latitude     725381 non-null  float64       
 7   dropoff_longitude   725381 non-null  float64       
 8   dropoff_latitude    725381 non-null  float64       
 9   store_and_fwd_flag  725381 non-null  int64         
 10  trip_duration       725381 non-null  int64         
 11  pickup_month        725381 non-null  int32         
 12  pickup_day          725381 non-null  int32         
 13  pickup_weekday      725381 non-nul

In [15]:
# Summary statistics (count, mean, min, quartiles, max, std) per column.
df.describe()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,is_weekend,distance_km
count,725381.0,725381,725381,725381.0,725381.0,725381.0,725381.0,725381.0,725381.0,725381.0,725381.0,725381.0,725381.0,725381.0,725381.0,725381.0,725381.0
mean,0.534372,2016-04-01 11:17:10.352323840,2016-04-01 11:31:08.359264256,1.661258,-73.973603,40.750932,-73.973498,40.751808,0.005523,838.00694,3.51838,15.502639,3.048744,13.612042,29.59902,0.285081,3.448168
min,0.0,2016-01-01 00:01:14,2016-01-01 00:05:54,0.0,-121.933342,35.310307,-121.933304,35.173546,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.000424
25%,0.0,2016-02-17 18:51:18,2016-02-17 19:04:14,1.0,-73.991867,40.737373,-73.991318,40.735966,0.0,398.0,2.0,8.0,1.0,9.0,15.0,0.0,1.241217
50%,1.0,2016-04-01 17:23:04,2016-04-01 17:37:24,1.0,-73.981773,40.754097,-73.979774,40.754532,0.0,663.0,4.0,15.0,3.0,14.0,30.0,0.0,2.104309
75%,1.0,2016-05-15 07:08:30,2016-05-15 07:17:33,2.0,-73.967438,40.768314,-73.963104,40.769741,0.0,1074.0,5.0,23.0,5.0,19.0,45.0,1.0,3.887425
max,1.0,2016-06-30 23:59:37,2016-07-01 00:46:37,7.0,-72.074333,41.586273,-72.022408,42.090183,1.0,10451.0,6.0,31.0,6.0,23.0,59.0,1.0,97.586212
std,0.498818,,,1.311365,0.068349,0.029024,0.067326,0.033225,0.074109,656.678504,1.680523,8.699308,1.95458,6.402445,17.332497,0.451453,3.93868


In [None]:
# Break the pickup timestamp into calendar/clock components, one column each
# (pickup_month, pickup_day, pickup_weekday, pickup_hour, pickup_minute).
pickup = df["pickup_datetime"].dt
for part in ("month", "day", "weekday", "hour", "minute"):
    df[f"pickup_{part}"] = getattr(pickup, part)

# Weekday codes 5 and 6 are Saturday and Sunday (Monday is 0).
weekend_mask = df["pickup_weekday"].isin([5, 6])
df["is_weekend"] = weekend_mask.astype(int)


In [None]:
#This code calculates the great-circle distance (the shortest path over the Earth’s surface) between the pickup and dropoff points in kilometers using the Haversine formula, which accounts for the Earth’s curvature.
#It first converts latitude and longitude from degrees to radians, computes the differences, applies the trigonometric formula to find the central angle, and multiplies by the Earth’s radius (6371 km).
#The result is stored in a new DataFrame column distance_km for each trip.

In [None]:
def haversine(lon1, lat1, lon2, lat2, radius_km=6371.0):
    """Return the great-circle distance between two points on a sphere.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all
    accepted (inputs must be broadcastable against each other).

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point, in decimal degrees.
    radius_km : float, default 6371.0
        Radius of the sphere in kilometres; the default is the mean Earth
        radius. The result is expressed in the same unit as this radius.

    Returns
    -------
    float, numpy.ndarray or pandas.Series
        Distance(s) along the surface of the sphere, in kilometres for the
        default radius.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make arcsin(sqrt(a)) return NaN.
    a = np.clip(a, 0.0, 1.0)
    return radius_km * 2 * np.arcsin(np.sqrt(a))

# Pickup -> drop-off distance for every trip, computed row-wise by haversine().
df["distance_km"] = haversine(
    lon1=df["pickup_longitude"],
    lat1=df["pickup_latitude"],
    lon2=df["dropoff_longitude"],
    lat2=df["dropoff_latitude"],
)


In [None]:
# Replace vendor_id with integer category codes (0, 1, ...).
df["vendor_id"] = df["vendor_id"].astype("category").cat.codes

# Encode the store-and-forward flag as 0/1. The "N"/"Y" mapping is applied
# only when the column is still non-numeric: mapping an already-encoded 0/1
# column would produce all-NaN, and fillna(0) would then silently wipe the
# flag (e.g. when this cell is re-run or the CSV is already pre-processed).
store_flag = df["store_and_fwd_flag"]
if not pd.api.types.is_numeric_dtype(store_flag):
    store_flag = store_flag.map({"N": 0, "Y": 1})
# Missing or unrecognised values fall back to 0.
df["store_and_fwd_flag"] = store_flag.fillna(0).astype(int)


In [None]:
# Keep trips with strictly positive duration and distance, at most
# 3 hours (10800 s) long and at most 100 km apart.
duration_s = df["trip_duration"]
distance = df["distance_km"]
keep_rows = (
    (duration_s > 0)
    & (distance > 0)
    & (duration_s <= 10800)
    & (distance <= 100)
)
df = df[keep_rows]


In [None]:
#df[(df["trip_duration"] > 0) & (df["distance_km"] > 0)] → Keeps only trips where duration and distance are both positive (removes zero/negative values, likely due to errors).

#df[df["trip_duration"] <= 10800] → Removes trips longer than 3 hours (10800 seconds), treating them as outliers.

#df[df["distance_km"] <= 100] → Removes trips longer than 100 km, also treating them as unrealistic outliers.

In [None]:
# Predictor columns, grouped by origin; the concatenation order is the
# column order of X.
trip_columns = ["vendor_id", "passenger_count", "store_and_fwd_flag"]
time_columns = ["pickup_month", "pickup_day", "pickup_weekday",
                "pickup_hour", "pickup_minute", "is_weekend"]
features = trip_columns + time_columns + ["distance_km"]

X = df.loc[:, features]
# Target is log(1 + duration in seconds); predictions are inverted with expm1.
y = df["trip_duration"].pipe(np.log1p)


In [None]:
#Select features → Chooses specific columns as predictors for the model.

#Create X → Stores those predictor columns in a new DataFrame.

#Create y with log transform → Uses np.log1p() to transform trip_duration, reducing skewness and helping the model handle large values better.

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# 100-tree random forest, seeded for repeatability, trained on all CPU cores.
forest_params = {"n_estimators": 100, "random_state": 42, "n_jobs": -1}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)


In [None]:
#This code creates a Random Forest Regressor with 100 trees and trains it on the training data using all CPU cores for faster processing.

In [11]:
# Predict on the held-out set (the model works in log1p space).
y_pred_log = model.predict(X_test)

# Undo the log1p transform so both series are in seconds again.
y_test_sec = np.expm1(y_test)
y_pred = np.expm1(y_pred_log)

# Score on the original (seconds) scale.
squared_error = mean_squared_error(y_test_sec, y_pred)
rmse = np.sqrt(squared_error)
r2 = r2_score(y_test_sec, y_pred)

print("R² Score:", r2)
print("RMSE (seconds):", rmse)


R² Score: 0.713315568909054
RMSE (seconds): 352.14017961680673


In [None]:
#It predicts trip durations for the test set, converts them back from the log scale to seconds, then calculates and prints the model’s R² score (the share of the variance in trip duration that the model explains) and RMSE (root-mean-squared prediction error, in seconds).