In [3]:
# Task description: https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/cohorts/2024/01-intro/homework.md

In [6]:
!pip install pandas scikit-learn pyarrow fastparquet

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/31/9e/6ebb433de864a6cd45716af52a4d7a8c3c9aaf3a98368e61db9e69e69a9c/pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata
  Using cached pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/17/04/d5d556b6c88886c092cc989433b2bab62488e0f0dafe616a1d5c9cb0efb1/scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting pyarrow
  Obtaining dependency information for pyarrow from https://files.pythonhosted.org/packages/54/e3/d5cfd7654084e6c0d9c3ce949e5d9e0ccad569ae1e2d5a68a3ec03b2be89/pyarrow-19.0.1-cp310-cp310-win_amd64.whl.metadata
  Downloading pyarrow-19.0.1-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting fastparquet
  Obtaining dependency information for fastparquet from https:/


[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Local parquet files holding the trip records: the 2023-01 file is the
# training month and the 2023-02 file is the validation month.
train_file = "yellow_tripdata_2023-01.parquet"
val_file = "yellow_tripdata_2023-02.parquet"

# Read both files (training first, then validation) into DataFrames.
df_train, df_val = (pd.read_parquet(path) for path in (train_file, val_file))

In [28]:
# Q1: Number of columns
# The column count is the length of the frame's column index.
column_count = len(df_train.columns)
print(f"Q1: Number of columns in January dataset: {column_count}")

Q1: Number of columns in January dataset: 19


In [29]:
# Q2: Duration computation
# Trip duration in minutes: (drop-off timestamp - pick-up timestamp),
# converted to seconds and divided by 60. Stored as a new 'duration' column.
df_train['duration'] = (
    pd.to_datetime(df_train['tpep_dropoff_datetime'])
    .sub(pd.to_datetime(df_train['tpep_pickup_datetime']))
    .dt.total_seconds()
    .div(60)
)

# Standard deviation of the duration over all (still unfiltered) training rows.
std_duration = df_train.duration.std()
print(f"Q2: Std. deviation of duration: {std_duration:.2f}")

Q2: Std. deviation of duration: 42.59


In [30]:
# Q3: Dropping outliers (duration between 1 and 60)
# Row count before filtering, used as the denominator of the kept fraction.
initial_count = len(df_train)

# Keep only trips whose duration lies in [1, 60] minutes (both ends inclusive).
# NOTE: df_train is overwritten here, so re-running this cell alone measures
# the fraction against the already-filtered frame.
df_train = df_train[df_train['duration'].between(1, 60)]

filtered_count = len(df_train)
fraction_left = filtered_count / initial_count * 100
print(f"Q3: Fraction of records left after dropping outliers: {fraction_left:.0f}%")

Q3: Fraction of records left after dropping outliers: 98%


In [31]:
# Q4: One-hot encoding pickup/dropoff IDs
# Cast both location-ID columns to strings before building the feature dicts.
location_columns = ['PULocationID', 'DOLocationID']
for column in location_columns:
    df_train[column] = df_train[column].astype(str)

# One dict per row, then fit the vectorizer on them to get the feature matrix.
train_dicts = df_train[location_columns].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
print(f"Q4: Dimensionality of feature matrix: {X_train.shape[1]}")

Q4: Dimensionality of feature matrix: 515


In [32]:
# Q5: Training linear regression
# Target vector: the trip duration of every remaining training row.
y_train = df_train['duration'].values

# Fit a plain linear regression on the encoded features (fit returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# Score the model on the data it was trained on: RMSE = sqrt(MSE).
y_pred_train = lr.predict(X_train)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)
print(f"Q5: RMSE on training data: {rmse_train:.2f}")

Q5: RMSE on training data: 7.65


In [33]:
# Q6: Validation set preparation and evaluation
# Apply the same steps used on the training frame to the validation frame.

# Trip duration in minutes (drop-off minus pick-up).
df_val['duration'] = (
    pd.to_datetime(df_val['tpep_dropoff_datetime'])
    .sub(pd.to_datetime(df_val['tpep_pickup_datetime']))
    .dt.total_seconds()
    .div(60)
)

# Validation outlier filtering: keep durations in [1, 60] minutes, ends inclusive.
df_val = df_val[df_val['duration'].between(1, 60)]

# Validation feature encoding: string-cast the location IDs, then encode them
# with the vectorizer that was already fitted on the training data
# (transform only, no refit).
val_location_columns = ['PULocationID', 'DOLocationID']
for column in val_location_columns:
    df_val[column] = df_val[column].astype(str)
val_dicts = df_val[val_location_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_val['duration'].values

# Validation prediction and RMSE = sqrt(MSE).
y_pred_val = lr.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = np.sqrt(mse_val)
print(f"Q6: RMSE on validation data: {rmse_val:.2f}")

Q6: RMSE on validation data: 7.81


In [None]:
# homework check https://courses.datatalks.club/mlops-zoomcamp-2024/homework/hw1