In [4]:
import hashlib
import os

# Fingerprint model.bin so that successive runs can tell whether the file changed:
# MD5 of the full contents, plus the size and modification time reported by the OS.
with open('model.bin', 'rb') as f:
    digest = hashlib.md5()
    digest.update(f.read())
    file_hash = digest.hexdigest()

print(f"Model file hash: {file_hash}")
print(f"File size: {os.path.getsize('model.bin')} bytes")
print(f"File modified: {os.path.getmtime('model.bin')}")

Model file hash: ed21c77dd95b293b0d7ef6682f498c81
File size: 17376 bytes
File modified: 1750063585.179188


In [5]:
import pickle
import warnings
import sklearn

print(f"sklearn version: {sklearn.__version__}")

# Record every warning raised while unpickling instead of letting the default
# filters decide which ones reach the output.
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")

    # NOTE: pickle.load can execute arbitrary code; only open a model.bin you trust.
    with open('model.bin', 'rb') as f:
        dv, model = pickle.load(f)

    if not w:
        print("✅ No warnings!")
    else:
        print(f"⚠️  Got {len(w)} warnings:")
        for warning in w:
            print(f"   {warning.message}")

print("Model loaded")

sklearn version: 1.0.2
   Trying to unpickle estimator DictVectorizer from version 1.5.0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
   Trying to unpickle estimator LinearRegression from version 1.5.0 when using version 1.0.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
Model loaded


In [6]:
# Show the scikit-learn entry (name and pinned version) from pip's list of installed packages
!pip freeze | grep scikit-learn

scikit-learn==1.0.2


In [7]:
# Print the version of the `python` executable the shell resolves
!python -V

Python 3.9.12


In [8]:
import sklearn
import pandas as pd
import numpy as np
import pickle

# Report the library versions imported by this kernel
print(f"sklearn version: {sklearn.__version__}")
print(f"pandas version: {pd.__version__}")

# Load model
# The pickle is unpacked into two objects, bound as dv and model.
# NOTE: pickle.load can execute arbitrary code; only open a model.bin you trust.
with open('model.bin', 'rb') as f:
    dv, model = pickle.load(f)
print("Model loaded successfully")

sklearn version: 1.0.2
pandas version: 1.3.5
Model loaded successfully


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [4]:
# List every package and version that pip reports as installed
!pip list

Package                   Version
------------------------- -----------
anyio                     4.7.0
argon2-cffi               21.3.0
argon2-cffi-bindings      21.2.0
asttokens                 3.0.0
async-lru                 2.0.4
attrs                     24.3.0
babel                     2.16.0
backcall                  0.2.0
beautifulsoup4            4.12.3
bleach                    6.2.0
Bottleneck                1.4.2
Brotli                    1.0.9
certifi                   2025.4.26
cffi                      1.17.1
charset-normalizer        3.3.2
comm                      0.2.1
debugpy                   1.8.11
decorator                 5.1.1
defusedxml                0.7.1
exceptiongroup            1.2.0
executing                 0.8.3
fastjsonschema            2.20.0
h11                       0.16.0
httpcore                  1.0.9
httpx                     0.28.1
idna                      3.7
importlib_metadata        8.5.0
ipykernel                 6.29.5
ipython            

In [9]:
import numpy as np

In [6]:
import pickle
import pandas as pd

In [5]:
# Unpickle the two objects stored in model.bin and bind them as dv and model.
# NOTE: pickle.load can execute arbitrary code; only open a model.bin you trust.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [9]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    """Read a trip parquet file and prepare it for prediction.

    A ``duration`` column (drop-off minus pick-up, in minutes) is added, only
    trips lasting from 1 to 60 minutes inclusive are kept, and the columns
    named in ``categorical`` are converted to strings (missing values become
    ``'-1'``).

    Parameters
    ----------
    filename : str
        Path or URL passed to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A copy of the filtered rows with the extra ``duration`` column.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep trips of 1..60 minutes (both ends included); copy so later
    # assignments do not write into a slice of the frame that was read.
    keep = trips.duration.between(1, 60)
    trips = trips[keep].copy()

    # Missing IDs -> -1, then int -> str so the values are integer-formatted strings
    trips[categorical] = trips[categorical].fillna(-1).astype('int').astype('str')

    return trips

In [10]:
# Load and prepare the trip file (yellow_tripdata_2023-03, per the file name) straight from its URL
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

In [11]:
# One dict per row, holding only the categorical columns
dicts = df[categorical].to_dict(orient='records')
# Encode the dicts with the unpickled dv, then predict with the unpickled model
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [13]:
# Sanity check of the prepared frame: its shape and its first five column names
print(f"dataset shape: {df.shape}")
print(f"first few column names: {list(df.columns[:5])}")

dataset shape: (3316216, 20)
first few column names: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance']


In [14]:
# Summary statistics of the duration column
print("Duration column we created:")
print(df['duration'].describe())

Duration column we created:
count    3.316216e+06
mean     1.499996e+01
std      1.060465e+01
min      1.000000e+00
25%      7.483333e+00
50%      1.211667e+01
75%      1.930000e+01
max      6.000000e+01
Name: duration, dtype: float64


In [11]:
# First three rows: pick-up time, drop-off time and the duration column
print("sample trips:")
print(df[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'duration']].head(3))

sample trips:
  tpep_pickup_datetime tpep_dropoff_datetime   duration
0  2023-03-01 00:06:43   2023-03-01 00:16:43  10.000000
1  2023-03-01 00:08:25   2023-03-01 00:39:30  31.083333
2  2023-03-01 00:15:04   2023-03-01 00:29:26  14.366667


In [12]:
# Pick-up and drop-off location IDs of the same first three rows
print("where these trips went")
print(df[['PULocationID', 'DOLocationID']].head(3))

where these trips went
  PULocationID DOLocationID
0          238           42
1          138          231
2          140          186


In [13]:
# Put the model output next to the observed duration for the first three trips
print("what your model predicted for those same 3 trips")
print(y_pred[:3])
print("\nactual vs predicted:")
# zip pairs each observed duration with its prediction by position; the space
# after "predicted" keeps the label from running into the number.
for trip_no, (actual, predicted) in enumerate(zip(df['duration'].iloc[:3], y_pred[:3]), start=1):
    print(f"trip {trip_no}: actual {actual:.1f} min, predicted {predicted:.1f} min")

what your model predicted for those same 3 trips
[16.24590642 26.1347962  11.88426424]

actual vs predicted:
trip 1: actual 10.0 min, predicted16.2 min
trip 2: actual 31.1 min, predicted26.1 min
trip 3: actual 14.4 min, predicted11.9 min


### Question 1: standard deviation of the predicted durations

In [16]:
# Run the Q1 calculation
# np.std defaults to ddof=0, i.e. the population standard deviation of the predictions
std_deviation = np.std(y_pred)
print(f"Standard deviation: {std_deviation}")

Standard deviation: 6.247488852238703


In [12]:
def read_data(filename):
    """Load a trip parquet file and prepare it for the duration model.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps only
    trips lasting from 1 to 60 minutes inclusive, and converts the pick-up and
    drop-off location IDs to strings.

    Parameters
    ----------
    filename : str
        Path or URL passed to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, with ``duration`` added and
        the location ID columns stored as integer-formatted strings
        (missing IDs become ``'-1'``).
    """
    df = pd.read_parquet(filename)
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60
    # .copy() detaches the filtered rows from the frame that was read, so the
    # column assignment below does not write into a slice (SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
    categorical = ['PULocationID', 'DOLocationID']
    # Fill missing IDs with -1 and cast to int before str: an ID read as float
    # then becomes '238' instead of '238.0', and a missing one '-1' instead of 'nan'.
    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    return df

# Load and prepare the trip file (yellow_tripdata_2023-03, per the file name)
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')
categorical = ['PULocationID', 'DOLocationID']
# One feature dict per row, restricted to the two location ID columns
val_dicts = df[categorical].to_dict(orient='records')
# Encode with dv, then predict with model (both bound by an earlier cell)
X_val = dv.transform(val_dicts)
y_pred = model.predict(X_val)

print(f"Standard deviation: {np.std(y_pred)}")

Standard deviation: 6.247488852238703


### Question 2: size of the predictions output file

In [13]:
# Q2: write the predictions to a parquet file and report its size on disk
import os

year = 2023
month = 3

# ride_id has the form "<YYYY>/<MM>_<row index label>"
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

# Two-column result frame: the ride identifier and the predicted duration
df_result = df[['ride_id']].assign(predicted_duration=y_pred)

# Write without compression and without the index
output_file = 'predictions.parquet'
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

# Size in units of 1024*1024 bytes
file_size_mb = os.path.getsize(output_file) / (1024*1024)
print(f"File size: {file_size_mb:.1f}M")

File size: 65.5M


In [None]:
import sys
# Show which interpreter the kernel runs on, next to the virtualenv name it is expected to contain
print("Python executable:", sys.executable)
print("Should contain: 04_-_deployment-SeBN4jeX")

Python executable: /home/codespace/.local/share/virtualenvs/04_-_deployment-SeBN4jeX/bin/python
Should contain: 04_-_deployment-SeBN4jeX


: 