# Homework04

In [1]:
import pickle
import pandas as pd
import warnings
import os

warnings.filterwarnings("ignore")

In [2]:
with open("model.bin", "rb") as f_in:
    dv, model = pickle.load(f_in)

In [3]:
categorical = ["PULocationID", "DOLocationID"]


def read_data(filename):
    df = pd.read_parquet(filename)

    df["duration"] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df["duration"] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype("int").astype("str")

    return df


df = read_data(
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet"
)

dicts = df[categorical].to_dict(orient="records")
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

## Q1. Notebook What's the standard deviation of the predicted duration for this dataset?

In [24]:
print("The stardard deviation of the predictions is: ", round(y_pred.std(), 3))

The stardard deviation of the predictions is:  6.247


## Q2 Preparing the output

In [25]:
df["ride_id"] = (
    df["tpep_pickup_datetime"].dt.year.astype(str).str.zfill(4)
    + "/"
    + df["tpep_pickup_datetime"].dt.month.astype(str).str.zfill(2)
    + "_"
    + df.index.astype(str)
)

df_result = df[["ride_id"]].copy()
df_result["duration"] = y_pred

os.makedirs("output", exist_ok=True)
output_file = "output/result.parquet"
df_result.to_parquet(output_file, engine="pyarrow", compression=None, index=False)

In [26]:
size_bytes = os.path.getsize(output_file)
size_mb = size_bytes / (1024 * 1024)

print(f"The output file size: {size_bytes:,} bytes ({size_mb:.2f} MB)")

The output file size: 68,641,739 bytes (65.46 MB)


## Q3 Creating the scoring script

In [27]:
!jupyter nbconvert --to script homework04.ipynb --output scoring

[NbConvertApp] Converting notebook homework04.ipynb to script
[NbConvertApp] Writing 6331 bytes to scoring.py


In [28]:
!pwd

/workspaces/zoomcamp-mlops-mukesh-k-debata/04-deployment


## Q4. Virtual environment

In [29]:
# Find the hash of scikit-learn in the uv.lock file
# using uv has my environment

with open("../uv.lock", "r") as f:
    lines = f.readlines()

scikit_learn_found = False
for i, line in enumerate(lines):
    if "scikit-learn" in line:
        scikit_learn_found = True
        # Look for hash in nearby lines
        for j in range(i, min(i + 10, len(lines))):
            if "hash" in lines[j]:
                print(lines[j].strip())
                break
        if scikit_learn_found:
            break

sdist = { url = "https://files.pythonhosted.org/packages/2c/22/a2d2fa7d142959720a3c5b5ffcd914be9fe02e299734ddd21c293a797981/mlflow-2.22.0.tar.gz", hash = "sha256:5b8780a8407c1b2ad441a13054f33ed0a0a58df4a69ddada2e30321d287e4f87", size = 28375641, upload-time = "2025-04-24T09:11:44.059Z" }


## Q5 Parametrize the script

In [6]:
!python src/scoring.py 2023 04

The mean predicted duration is 14.292 minutes


## Q6 Docker container

In [7]:
# Dockerfile with uv
with open("Dockerfile", "r") as f:
    dockerfile_content = f.read()
print("\nDockerfile content:\n")
print(dockerfile_content)


Dockerfile content:

FROM agrigorev/zoomcamp-model:mlops-2024-3.10.13-slim

# Install uv as the Python package manager
RUN pip install -U pip
RUN pip install uv

WORKDIR /app

COPY ["requirements.txt", "./"]

# Use uv to install dependencies with --system flag for global installation
RUN uv pip install --system -r requirements.txt

COPY ["src/scoring.py", "./"]

RUN mkdir -p output

CMD ["python", "scoring.py", "2023", "03"]


In [8]:
# Docker image with the fixed Dockerfile
!docker build -t taxi-duration-prediction .

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                          docker:default
[?25h[1A[0G[?25l[+] Building 0.2s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 446B                                       0.0s
[0m => [internal] load metadata for docker.io/agrigorev/zoomcamp-model:mlops  0.2s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 446B                                       0.0s
[0m => [internal] load metadata for docker.io/agrigorev/zoomcamp-model:mlops  0.3s
[?25h[1A[1A[1A[1A[0G[?25l[+] Building 0.5s (1/2)                                          docker:default
[34m => [internal] load build definition from Dockerfile     

In [9]:
# Run the container
!docker run --rm taxi-duration-prediction

The mean predicted duration is 0.189 minutes
