In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# 1) Downloading the data 

In [2]:
df_train = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [3]:
df_val = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [4]:
print(f"→ January dataset has {df_train.shape[1]} columns.")

→ January dataset has 19 columns.


# 2) Computing duration

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [6]:
def compute_duration(df):
    """
    Add a 'duration' feature (in minutes) to the dataframe.
    """
    return df.assign(
        duration = (
            (
                    pd.to_datetime(df['tpep_dropoff_datetime'])
                    - pd.to_datetime(df['tpep_pickup_datetime'])
            ).dt.total_seconds()
            / 60.0
            )
    )

In [7]:
df_train = compute_duration(df_train)
df_val = compute_duration(df_val)

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [9]:
print("→ The standard deviation of the trips duration in January is",
      df_train['duration'].describe()['std'].round(2),
      "minutes."
     )

→ The standard deviation of the trips duration in January is 42.59 minutes.


# 3) Dropping outliers

In [10]:
def drop_outliers(
    df: pd.DataFrame
) -> tuple[pd.DataFrame, int, int]:
    """
    Remove rows with outlier durations (below 1 min. and above 60 min.) from the dataframe 'df'.

    Returns: a tuple with 3 values,
        − the processed dataframe;
        − the number of records before processing;
        − the number of records after processing.
    """

    mask = (df.duration >= 1) & (df.duration <= 60)

    n_rows_before = len(df)
    df = df.loc[mask]
    n_rows_after = len(df)

    return (df, n_rows_before, n_rows_after)

In [11]:
df_train, n_before, n_after = drop_outliers(df_train)
df_val = drop_outliers(df_val)[0]

In [12]:
print(
    "→ After dropping outliers, the fraction of records left is "
    f"{round(n_after * 100.0 / n_before)}%."
)

→ After dropping outliers, the fraction of records left is 98%.


# 4) One-hot encoding

In [13]:
features = ['PULocationID', 'DOLocationID']
target = ['duration']

In [14]:
dv = DictVectorizer()
X_train = dv.fit_transform(
    df_train[features].to_dict(orient='records')
)

In [15]:
type(X_train)

scipy.sparse._csr.csr_matrix

In [16]:
X_train.toarray()

array([[141., 161.],
       [237.,  43.],
       [238.,  48.],
       ...,
       [239., 114.],
       [ 79., 230.],
       [143., 262.]])

In [17]:
print(f"→ Dimensionality of the X_train OHE matrix: {X_train.ndim}")

→ Dimensionality of the X_train OHE matrix: 2


In [18]:
X_val = dv.transform(
    df_val[features].to_dict(orient='records')
)

# 5) Model training

In [19]:
y_train = df_train[target]
y_val = df_val[target]

In [20]:
y_train.shape

(3009173, 1)

In [21]:
# Train from test dataset
lr = LinearRegression()
lr.fit(X_train, y_train)

In [22]:
# Predict from train dataset
y_pred = lr.predict(X_train)

In [23]:
# Use RMSE metric
print(f"→ RMSE on train set = {round(root_mean_squared_error(y_train, y_pred), 2)} minutes.")

→ RMSE on train set = 9.84 minutes.


# 6) Evaluate the model

In [24]:
y_pred = lr.predict(X_val)
print(f"→ RMSE on validation set = {round(root_mean_squared_error(y_val, y_pred), 2)} minutes.")

→ RMSE on validation set = 9.96 minutes.
