In [1]:
!python -V

Python 3.12.3


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer 
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

# Explore data

In [3]:
# Directory that holds the monthly trip parquet files; every file path in this notebook is built from it.
data_path = "~/notebooks/data/"
# January 2023 file, loaded here for the exploratory cells that follow.
df = pd.read_parquet(f"{data_path}yellow_tripdata_2023-01.parquet")

In [4]:
# Show the column names and dtypes of the frame loaded above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [5]:
# The pickup/dropoff columns are already datetime64 after reading the parquet file
# (see the df.info() output above), so no pd.to_datetime conversion is needed.

# Trip duration in minutes. The vectorized .dt accessor replaces a per-row Python
# lambda, which is far slower on a frame of this size and yields the same values.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

# Spread of the raw (unfiltered) durations, in minutes.
df.duration.std()

42.59435124195458

In [6]:
# Fraction of trips whose duration lies between 1 and 60 minutes (both bounds inclusive).
((df.duration >= 1) & (df.duration <= 60)).mean()

0.9812202822125979

# Read and transform training and validation datasets

In [7]:

def read_dataframe(filename):
    """Load one trip parquet file and prepare it for modelling.

    Steps: read the file, add a ``duration`` column (minutes between pickup
    and dropoff), keep only trips lasting between 1 and 60 minutes inclusive,
    and cast the location-ID columns to strings so they can be fed to a
    dictionary vectorizer as categorical features.

    Parameters
    ----------
    filename : str
        Path of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows (original index labels are
        kept) with the extra ``duration`` column and with ``PULocationID`` /
        ``DOLocationID`` converted to strings.

    Side effects
    ------------
    Prints the filename, the standard deviation of the unfiltered durations,
    and the fraction of rows that fall inside the 1-60 minute window.
    """
    print(filename)

    df = pd.read_parquet(filename)

    # Duration in minutes. The vectorized .dt accessor avoids one Python call per
    # row and always yields a float column, even when the frame is empty.
    df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
    print(f"Duration std.dev.: {df.duration.std()}")

    # About 98% of the trips last between 1 and 60 minutes, so from a business point
    # of view it is worth focusing on those and dropping the remaining ~2%.
    # Note: the printed value is a fraction in [0, 1] and both bounds are inclusive,
    # even though the message wording says "percentage" and uses strict signs.
    in_range = (df.duration >= 1) & (df.duration <= 60)
    print(f"Percentage of data with a duration > 1 and < 60 minutes: {in_range.mean()}")

    # Take an explicit copy: assigning columns on a filtered slice would otherwise
    # raise SettingWithCopyWarning on pandas versions without copy-on-write.
    df = df[in_range].copy()

    # Location IDs become strings so that the dictionary vectorizer treats them
    # as categories instead of numeric values.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
# January 2023 is the training month; February 2023 is the held-out validation month.
df_train = read_dataframe(f"{data_path}yellow_tripdata_2023-01.parquet")
df_val = read_dataframe(f"{data_path}yellow_tripdata_2023-02.parquet")

~/notebooks/data/yellow_tripdata_2023-01.parquet
Duration std.dev.: 42.59435124195458
Percentage of data with a duration > 1 and < 60 minutes: 0.9812202822125979
~/notebooks/data/yellow_tripdata_2023-02.parquet
Duration std.dev.: 42.84210176105113
Percentage of data with a duration > 1 and < 60 minutes: 0.9800944077722545


In [9]:
# Row counts of the filtered training and validation frames.
len(df_train), len(df_val)

(3009173, 2855951)

In [10]:
# (rows, columns) of each frame after filtering.
df_train.shape, df_val.shape 

((3009173, 20), (2855951, 20))

# Encode the data: first turn each row into a dictionary, then apply a dictionary vectorizer to obtain the feature matrix

In [11]:
# Feature columns. Only the categorical ones are passed to the vectorizer in this
# cell; `numerical` is declared but not used here.
numerical = ['trip_distance']
categorical = ['PULocationID', 'DOLocationID']

# One {column: value} dictionary per row, for each split.
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the fitted mapping to
# encode the validation rows so both matrices share the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [12]:
# (n_rows, n_features) of the vectorized training matrix.
X_train.shape

(3009173, 515)

In [13]:
# Regression target: the trip duration column, pulled out as plain NumPy arrays.
target = 'duration'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [14]:
# Fit an ordinary least-squares model on the training matrix.
lr = LinearRegression().fit(X_train, y_train)

# Predict on the same rows the model was fitted on.
y_pred = lr.predict(X_train)

# Training RMSE, in the same unit as the target.
train_mse = mean_squared_error(y_train, y_pred)
np.sqrt(train_mse)

7.649261932106969

In [15]:
# Predict on the held-out validation matrix with the fitted model.
y_pred = lr.predict(X_val)

# Validation RMSE, in the same unit as the target.
val_mse = mean_squared_error(y_val, y_pred)
np.sqrt(val_mse)

7.811818743246608