In [68]:
import pandas as pd

In [95]:
def df_features(data, to_encode):
    """Load a trip-record parquet file and return model features and target.

    The trip duration in minutes is derived from the pickup and dropoff
    timestamps, and only trips lasting between 1 and 60 minutes (inclusive)
    are kept.

    Parameters
    ----------
    data : path or file-like
        Anything accepted by ``pd.read_parquet``. The file must contain the
        ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` columns plus
        the columns named in ``to_encode``.
    to_encode : list of str
        Names of the columns to return as features, cast to ``str``.

    Returns
    -------
    tuple
        ``(features, duration)``: the ``to_encode`` columns as strings and
        the duration in minutes (a Series named ``duration``), both
        restricted to the kept trips and sharing the same index.
    """
    df = pd.read_parquet(data)
    # Dividing a timedelta Series by a Timedelta is vectorised and avoids a
    # slow row-by-row ``apply`` over millions of rows.
    duration = (
        (df.tpep_dropoff_datetime - df.tpep_pickup_datetime)
        / pd.Timedelta(minutes=1)
    ).rename('duration')
    keep = (duration >= 1) & (duration <= 60)
    # Select and cast in one step rather than assigning into a filtered
    # slice, which raises SettingWithCopyWarning.
    features = df.loc[keep, to_encode].astype(str)
    return features, duration[keep]


In [69]:
# Load the January 2023 yellow-taxi trip records used as the training data.
df = pd.read_parquet('data/yellow_tripdata_2023-01.parquet')

In [70]:
# Preview the first five rows to see the available columns.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [71]:
# List every column with its dtype, plus the total number of rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

Q1: Downloading the data  
19 Columns

In [72]:
# Raw trip duration as a timedelta: dropoff time minus pickup time.
df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration'].head()

0   0 days 00:08:26
1   0 days 00:06:19
2   0 days 00:12:45
3   0 days 00:09:37
4   0 days 00:10:50
Name: duration, dtype: timedelta64[ns]

In [73]:
# Trip duration in minutes, recomputed from the timestamp columns so this cell
# can be re-run safely (converting `duration` in place fails on a second run
# because the values are already floats). Dividing by a Timedelta is
# vectorised, unlike a row-wise `apply`.
df['duration'] = (
    (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime'])
    / pd.Timedelta(minutes=1)
)

In [74]:
# Standard deviation of the trip duration in minutes (answer to Q2).
df['duration'].std()

42.59435124195458

Q2. Computing duration:  
stdev 42.59435124195458

In [75]:
# Row count and dtype of the duration column before outliers are dropped.
df['duration'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3066766 entries, 0 to 3066765
Series name: duration
Non-Null Count    Dtype  
--------------    -----  
3066766 non-null  float64
dtypes: float64(1)
memory usage: 23.4 MB


In [76]:
# Confirm the duration column now holds plain floats (minutes).
df['duration'].dtype

dtype('float64')

In [77]:
# Keep only trips that lasted between 1 and 60 minutes, both ends inclusive.
df = df[df['duration'].between(1, 60)]

In [78]:
# Row count of the duration column after the outliers were dropped.
df['duration'].info()

<class 'pandas.core.series.Series'>
Int64Index: 3009173 entries, 0 to 3066765
Series name: duration
Non-Null Count    Dtype  
--------------    -----  
3009173 non-null  float64
dtypes: float64(1)
memory usage: 45.9 MB


In [79]:
# Fraction of trips kept by the 1-60 minute filter: rows after / rows before.
# Both counts are copied by hand from the two `info()` outputs above.
round((3009173 / 3066766), 2)

0.98

Q3. Dropping outliers:  
98%

In [None]:
# One-hot encode the pickup and dropoff location IDs; these are the only two
# features used by the model.

# Turn the dataframe into a list of dictionaries. The IDs are cast to strings
# first: DictVectorizer one-hot encodes string values, but passes numeric
# values through as a single numeric feature.

to_encode = ['PULocationID', 'DOLocationID']
# Cast on the selection instead of assigning back into `df`: `df` is a filtered
# slice at this point, so writing to it raises SettingWithCopyWarning.
dicts = df[to_encode].astype(str).to_dict(orient='records')

In [None]:
from sklearn.feature_extraction import DictVectorizer
# Vectorizer that turns each record dict into one row of the feature matrix.
dv = DictVectorizer()

In [57]:
# Fit the dictionary vectorizer on the training records and
# get the training feature matrix from it in the same step.

X = dv.fit_transform(dicts)

In [58]:
# (rows, columns) of the feature matrix; the column count is the answer to Q4.
X.shape

(3009173, 515)

Q4. One-hot encoding:  
Columns: 515

Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters, where duration is the response variable.
- Calculate the RMSE of the model on the training data.


In [63]:
from sklearn.linear_model import LinearRegression

In [65]:
# Plain linear regression with default parameters.
lr_model = LinearRegression()

In [80]:
# Response variable: trip duration in minutes for the filtered training trips.
y = df['duration']

In [81]:
# Fit the model on the training feature matrix and durations.
lr_model.fit(X, y)

In [82]:
from sklearn.metrics import mean_squared_error

In [83]:
# Predictions on the same data the model was trained on.
y_pred = lr_model.predict(X)

In [84]:
# RMSE on the training data. The square root is taken explicitly because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y, y_pred) ** 0.5

7.649262776520412

Q5. Training a model:  
RMSE 7.649262776520412

In [96]:
# Now let's apply this model to the validation dataset (February 2023).
# What's the RMSE on validation?

# Build the validation features and target with the same duration filter and
# string cast that were applied to the training data.
df_val, y_val = df_features('data/yellow_tripdata_2023-02.parquet', to_encode)
df_val.head(2)

Unnamed: 0,PULocationID,DOLocationID
0,142,163
3,132,26


In [99]:
# Encode the validation records with the vectorizer that was already fitted on
# the training data (transform only, no refit).
dicts_val = df_val.to_dict(orient='records')
X_val = dv.transform(dicts_val)
X_val

<2855951x515 sparse matrix of type '<class 'numpy.float64'>'
	with 5711894 stored elements in Compressed Sparse Row format>

In [97]:
# Validation target: trip durations in minutes.
y_val

0           1.683333
3          32.083333
4          13.300000
5          14.633333
6          27.950000
             ...    
2913950    19.000000
2913951    11.133333
2913952    14.000000
2913953     7.000000
2913954     9.800000
Name: duration, Length: 2855951, dtype: float64

In [100]:
# Predict durations for the validation trips.
y_val_pred = lr_model.predict(X_val)

In [102]:
# RMSE on the validation data, with the arguments in scikit-learn's
# (y_true, y_pred) order. The square root is taken explicitly because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_val_pred) ** 0.5

7.811802649882949

Q6. Evaluating the model:  
RMSE: 7.811802649882949