In [3]:
# Task description: https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/cohorts/2024/01-intro/homework.md

In [4]:
!pip install pandas sklearn pyarrow fastparquet



You should consider upgrading via the 'C:\Users\kiev-\IdeaProjects\untitled\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset for January 2023
green_dataset = "green_tripdata_2023-01.parquet"
yellow_dataset = "yellow_tripdata_2023-01.parquet"

df_train = pd.read_parquet(green_dataset)
df_val = pd.read_parquet(yellow_dataset)

In [6]:
# STep 1: Define a number of columns (EDA)
print(df_train.columns.tolist())
print(df_val.columns.tolist())
print(f"Number of columns in January taxi dataset #1: {df_train.shape[1]}")
print(f"Number of columns in January taxi dataset #2: {df_val.shape[1]}")


pickup_col = 'tpep_pickup_datetime' if 'tpep_pickup_datetime' in df_train.columns else 'lpep_pickup_datetime'
dropoff_col = 'tpep_dropoff_datetime' if 'tpep_dropoff_datetime' in df_train.columns else 'lpep_dropoff_datetime'

df_train[pickup_col] = pd.to_datetime(df_train[pickup_col])
df_train[dropoff_col] = pd.to_datetime(df_train[dropoff_col])

# Compute trip duration in minutes.
df_train['duration'] = (df_train[dropoff_col] - df_train[pickup_col]).dt.total_seconds() / 60

['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge', 'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge']
['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'congestion_surcharge', 'airport_fee']
Number of columns in January taxi dataset #1: 20
Number of columns in January taxi dataset #2: 19


In [7]:

# Step 2. Compute the standard deviation of the trip duration.
std_duration = df_train['duration'].std()
print("Q2. Standard deviation of trip duration (in minutes):", round(std_duration, 2))
# Expected answer: approximately 42.59

Q2. Standard deviation of trip duration (in minutes): 74.93


In [8]:

# --- Step 3. Drop Outliers ---
# Keep only rides with a duration between 1 and 60 minutes (inclusive).
mask = (df_train['duration'] >= 1) & (df_train['duration'] <= 60)
df_train_filtered = df_train[mask].copy()

# Calculate the fraction of records remaining after dropping outliers.
fraction_remaining = len(df_train_filtered) / len(df_train)
print("Q3. Fraction of records left after dropping outliers:", f"{round(fraction_remaining * 100, 0)}%")
# Expected answer: ~95%

Q3. Fraction of records left after dropping outliers: 97.0%


In [9]:

# --- Step 4. One-Hot Encoding ---
# Convert pickup and dropoff location IDs to strings.
df_train_filtered['PULocationID'] = df_train_filtered['PULocationID'].astype(str)
df_train_filtered['DOLocationID'] = df_train_filtered['DOLocationID'].astype(str)

# Create a list of dictionaries from the two features.
dicts_train = df_train_filtered[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# Initialize and fit a DictVectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(dicts_train)

# Q4. Get the dimensionality of the feature matrix.
print("Q4. Dimensionality of the one-hot encoded matrix:", X_train.shape[1])
# Expected answer: 515 columns


Q4. Dimensionality of the one-hot encoded matrix: 467


In [10]:
# --- Step 5. Training a Model ---
# Set the target variable.
y_train = df_train_filtered['duration'].values

# Train a linear regression model.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Compute the RMSE on the training data.
y_pred_train = lr.predict(X_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
print("Q5. Training RMSE:", round(rmse_train, 2))


Q5. Training RMSE: 7.04


In [11]:
# --- Step 6. Evaluating the Model on Validation Data (February 2023) ---
# Process the validation dataset similarly.
pickup_col_val = 'tpep_pickup_datetime' if 'tpep_pickup_datetime' in df_val.columns else 'lpep_pickup_datetime'
dropoff_col_val = 'tpep_dropoff_datetime' if 'tpep_dropoff_datetime' in df_val.columns else 'lpep_dropoff_datetime'

df_val[pickup_col_val] = pd.to_datetime(df_val[pickup_col_val])
df_val[dropoff_col_val] = pd.to_datetime(df_val[dropoff_col_val])
df_val['duration'] = (df_val[dropoff_col_val] - df_val[pickup_col_val]).dt.total_seconds() / 60

# Drop outliers in the validation set.
mask_val = (df_val['duration'] >= 1) & (df_val['duration'] <= 60)
df_val_filtered = df_val[mask_val].copy()

# Convert location IDs to strings.
df_val_filtered['PULocationID'] = df_val_filtered['PULocationID'].astype(str)
df_val_filtered['DOLocationID'] = df_val_filtered['DOLocationID'].astype(str)

# Transform the validation data using the previously fitted DictVectorizer.
dicts_val = df_val_filtered[['PULocationID', 'DOLocationID']].to_dict(orient='records')
X_val = dv.transform(dicts_val)
y_val = df_val_filtered['duration'].values

# Compute predictions and RMSE on the validation dataset.
y_pred_val = lr.predict(X_val)
rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)
print("Q6. Validation RMSE:", round(rmse_val, 2))

Q6. Validation RMSE: 18.01
