In [6]:
# Show the installed scikit-learn version; scikit-learn's persistence notes warn against unpickling a model under a different version than it was saved with.
!pip freeze | grep scikit-learn

scikit-learn==1.0.2


In [7]:
# Version of the `python` found on the shell PATH. NOTE(review): this can differ from the kernel's interpreter — confirm with sys.version if it matters.
!python -V

Python 3.9.12


In [8]:
import sklearn
import pandas as pd
import numpy as np
import pickle

# Print the library versions alongside the load so a version mismatch is visible in the output.
print(f"sklearn version: {sklearn.__version__}")
print(f"pandas version: {pd.__version__}")

# model.bin holds a pickled 2-tuple, unpacked here into dv and model
# (dv is presumably the fitted vectorizer used via dv.transform later — confirm against the training code).
# NOTE(review): pickle.load can execute arbitrary code from the file; only load a model.bin you trust.
with open('model.bin', 'rb') as f:
    dv, model = pickle.load(f)
print("Model loaded successfully")

sklearn version: 1.0.2
pandas version: 1.3.5
Model loaded successfully


See scikit-learn's notes on the security and maintainability limitations of pickled models:
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [9]:
import numpy as np

In [6]:
import pickle
import pandas as pd

In [5]:
# Loads model.bin again and rebinds dv and model to the pair stored in the file.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load a model.bin you trust.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [9]:
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Args:
        filename: Path or URL handed straight to ``pd.read_parquet``.

    Returns:
        A new DataFrame with a ``duration`` column (trip length in minutes),
        restricted to trips lasting between 1 and 60 minutes inclusive, and
        with the columns listed in ``categorical`` converted to strings
        (missing values become ``'-1'``).
    """
    trips = pd.read_parquet(filename)

    # Dropoff minus pickup is a timedelta; express it in minutes.
    elapsed = trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    # Copy so the column assignment below writes to an independent frame.
    trips = trips[trips.duration.between(1, 60)].copy()

    # Missing ids -> -1, then through int so the strings carry no decimal
    # part ('238' rather than '238.0').
    as_text = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_text

    return trips

In [10]:
# Load the yellow_tripdata_2023-03 parquet file from its URL through read_data, so every run of this cell fetches it again.
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

In [11]:
# One {column: value} dict per row of df, in row order, so y_pred lines up with df row for row.
dicts = df[categorical].to_dict(orient='records')
# dv and model come from model.bin; dv turns the dicts into the matrix that model.predict scores.
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [13]:
# Sanity check on the frame returned by read_data: its (rows, columns) shape and first five column names.
print(f"dataset shape: {df.shape}")
print(f"first few column names: {list(df.columns[:5])}")

dataset shape: (3316216, 20)
first few column names: ['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance']


In [14]:
# read_data keeps only trips of 1 to 60 minutes, so min and max in this summary should fall inside that range.
print("Duration column we created:")
print(df['duration'].describe())

Duration column we created:
count    3.316216e+06
mean     1.499996e+01
std      1.060465e+01
min      1.000000e+00
25%      7.483333e+00
50%      1.211667e+01
75%      1.930000e+01
max      6.000000e+01
Name: duration, dtype: float64


In [11]:
# Spot-check: duration should equal dropoff minus pickup, expressed in minutes.
print("sample trips:")
print(df[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'duration']].head(3))

sample trips:
  tpep_pickup_datetime tpep_dropoff_datetime   duration
0  2023-03-01 00:06:43   2023-03-01 00:16:43  10.000000
1  2023-03-01 00:08:25   2023-03-01 00:39:30  31.083333
2  2023-03-01 00:15:04   2023-03-01 00:29:26  14.366667


In [12]:
# The two location ID columns; read_data converts both to strings before they are fed to dv.
print("where these trips went")
print(df[['PULocationID', 'DOLocationID']].head(3))

where these trips went
  PULocationID DOLocationID
0          238           42
1          138          231
2          140          186


### Question 1: standard deviation of the predicted durations

In [12]:
def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    This redefines the ``read_data`` from earlier in the notebook and is kept
    consistent with it, so whichever definition ran last produces the same
    location strings.

    Args:
        filename: Path or URL handed straight to ``pd.read_parquet``.

    Returns:
        A new DataFrame with a ``duration`` column (trip length in minutes),
        restricted to trips lasting between 1 and 60 minutes inclusive, and
        with ``PULocationID`` / ``DOLocationID`` converted to strings
        (missing values become ``'-1'``).
    """
    categorical = ['PULocationID', 'DOLocationID']

    df = pd.read_parquet(filename)
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the unfiltered frame; without
    # it the column assignment below writes into a slice and pandas raises
    # SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Missing ids -> -1, then through int before str: a float-typed column
    # would otherwise become '238.0' / 'nan', which would not match location
    # strings of the form '238'.
    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    return df

# Reload the yellow_tripdata_2023-03 file with the read_data defined just above and score every row.
# This rebinds df, categorical, X_val and y_pred in the kernel, and the next section reads df and y_pred.
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')
categorical = ['PULocationID', 'DOLocationID']
# One {column: value} dict per row, in row order, so y_pred lines up with df.
val_dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_pred = model.predict(X_val)

# Spread of the predictions over all scored rows.
print(f"Standard deviation: {np.std(y_pred)}")

Standard deviation: 6.247488852238703


### Question 2: size of the saved predictions file

In [None]:
import os

# Ride ids have the form "<year>/<month>_<index label of the row in df>".
year, month = 2023, 3
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

# Output carries only the id and its prediction; y_pred is positional, so it
# must have been computed from this same df.
df_result = pd.DataFrame({'ride_id': df['ride_id'], 'predicted_duration': y_pred})

# Written with pyarrow, uncompressed and without the index.
output_file = 'predictions.parquet'
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

# Bytes -> units of 1024*1024 bytes, printed with an "M" suffix.
file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
print(f"File size: {file_size_mb:.1f}M")

File size: 65.5M
