In [1]:
# %pip runs pip for the kernel's own interpreter, so the version listed is the
# one this notebook actually imports (a bare `!pip` is resolved via the shell PATH).
%pip freeze | grep scikit-learn

scikit-learn==1.2.2


In [2]:
# NOTE(review): `!python` is resolved by the shell; presumably it is the same
# interpreter the kernel runs on — confirm (e.g. against sys.version).
!python -V

Python 3.8.15


In [4]:
import pickle
import pandas as pd
import numpy as np

In [6]:
# Unpickle the two-item (dv, model) tuple stored in model.bin.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('model.bin', mode='rb') as model_file:
    dv, model = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
# Columns that are turned into string categories by read_data.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Steps:
      * read ``filename`` with ``pd.read_parquet``;
      * add a ``duration`` column: drop-off minus pick-up time, in minutes;
      * keep only trips whose duration is between 1 and 60 minutes (inclusive);
      * in the ``categorical`` columns, replace missing values with -1 and
        convert the values to integer, then to string.

    Returns the filtered DataFrame (a copy, original row index preserved).
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Series.between is inclusive on both ends by default.
    within_range = trips['duration'].between(1, 60)
    trips = trips[within_range].copy()

    location_ids = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = location_ids

    return trips

In [8]:
# Load and clean the 2023-03 yellow-taxi trip file.
df = read_data(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/'
    'yellow_tripdata_2023-03.parquet'
)

In [9]:
# One {column: value} dict per trip, using only the categorical location columns.
dicts = df[categorical].to_dict(orient='records')
# Encode the dicts with the unpickled `dv`, then score them with `model`.
# NOTE(review): dv is presumably a fitted DictVectorizer — confirm against how model.bin was built.
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

# Q1. Notebook
What's the standard deviation of the predicted duration for this dataset?

In [11]:
# np.std defaults to ddof=0, i.e. the population standard deviation.
std = np.std(y_pred)
# The space in the `: .2f` spec reserves a sign column, hence the double space in the printed line.
print(f"The standard deviation is: {std: .2f}")

The standard deviation is:  6.25


# Q2. Preparing the output
What's the size of the output file?

In [14]:
# Integers are required here: the `:04d` / `:02d` format specs below reject strings.
# NOTE(review): later cells rebind `year` and `month` to the strings "2023" / "03",
# so these two names change type over the course of the notebook.
year = 2023
month = 3
# ride_id looks like "<YYYY>/<MM>_<row index>"; the index is the one that survived
# the duration filter in read_data, so it is not contiguous.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [16]:
# y_pred was computed from this frame's rows in order, so it lines up positionally.
# NOTE(review): this mutates `df` in place; re-running earlier cells will drop the column.
df['predictions'] = y_pred

In [19]:
# Keep only the two output columns and renumber the rows from 0.
df_result = (
    df.loc[:, ["ride_id", "predictions"]]
    .reset_index(drop=True)
)

In [20]:
# Write the result with pyarrow, uncompressed and without the DataFrame index.
# The output path "prediction" has no file extension.
df_result.to_parquet("prediction", engine='pyarrow', compression=None, index=False)

In [21]:
import os

In [24]:
# Size of the written file in megabytes (bytes divided by 1e6).
file_size = os.path.getsize("prediction") / 1e6
print(f"Prediction filesize: {file_size: .2f}")

Prediction filesize:  68.32


# Q3. Creating the scoring script
Which command do you need to execute to convert the notebook into a script?

```bash
jupyter nbconvert --to script notebook.ipynb

```

# Q4. Virtual environment
What's the first hash for the Scikit-Learn dependency?

In [25]:
# First hash listed for the scikit-learn dependency (answer to Q4).
# Named `sklearn_hash` rather than `hash` so the built-in hash() is not
# shadowed for the rest of the kernel session.
sklearn_hash = "sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c"

print(f"The hash is: {sklearn_hash}")

The hash is: sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c


# Q5. Parametrize the script

In [26]:
# Loads model.bin again and rebinds `dv` and `model`; the same file is already
# loaded near the top of this notebook.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open("model.bin", mode="rb") as model_file:
    dv, model = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [35]:
# NOTE(review): `categorical` and `read_data` are already defined, with the same
# logic, in an earlier cell of this notebook; this cell silently redefines them.
categorical = ["PULocationID", "DOLocationID"]


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Reads ``filename`` with ``pd.read_parquet``, adds a ``duration`` column
    (drop-off minus pick-up time, in minutes), keeps trips lasting between 1
    and 60 minutes inclusive, and converts the ``categorical`` columns to
    strings (missing values become "-1").

    Returns the filtered DataFrame (a copy, original row index preserved).
    """
    trips = pd.read_parquet(filename)

    elapsed = trips["tpep_dropoff_datetime"] - trips["tpep_pickup_datetime"]
    trips["duration"] = elapsed.dt.total_seconds() / 60

    # Series.between is inclusive on both ends by default.
    within_range = trips["duration"].between(1, 60)
    trips = trips[within_range].copy()

    location_ids = trips[categorical].fillna(-1).astype("int").astype("str")
    trips[categorical] = location_ids

    return trips


def prepare_features(year, month, n_rows=10):
    """Download one month of yellow-taxi trips and encode them for the model.

    Parameters
    ----------
    year, month : int or str
        Period to load. Both are normalized with ``int(...)`` and zero-padded,
        so ``(2023, 3)`` and ``("2023", "03")`` resolve to the same file name
        (``yellow_tripdata_2023-03.parquet``).
    n_rows : int or None, default 10
        Number of leading rows to encode. The default keeps the original
        behaviour of using only the first 10 trips; pass ``None`` to encode
        every row.

    Returns
    -------
    The output of ``dv.transform`` for the selected rows.

    Raises
    ------
    ValueError
        If ``year`` or ``month`` cannot be converted to an integer.
    """
    url = (
        "https://d37ci6vzurychx.cloudfront.net/trip-data/"
        f"yellow_tripdata_{int(year):04d}-{int(month):02d}.parquet"
    )
    df = read_data(url)

    selected = df[categorical]
    if n_rows is not None:
        selected = selected.iloc[:n_rows]

    dicts = selected.to_dict(orient="records")
    features = dv.transform(dicts)
    return features


def predict(features):
    """Return ``model.predict(features)`` for an already-encoded feature matrix.

    ``features`` is passed to the model unchanged; no vectorizing happens here.
    """
    return model.predict(features)

In [33]:
# Zero-padded strings, matching the YYYY-MM part of the parquet file name.
year = "2023"
month = "03"
features = prepare_features(year, month)

HERE


In [36]:
# Bare expression: the returned prediction array is shown as the cell output.
predict(features)

array([16.24590642, 26.1347962 , 11.88426424, 11.99771983, 10.23448579,
       10.59717421, 12.44479314, 10.972209  , 23.12034345, 10.28111639])

In [37]:
# NOTE(review): repeats the earlier cell that sets the same year/month and
# recomputes `features` with identical arguments — likely redundant; confirm before removing.
year = "2023"
month = "03"
features = prepare_features(year, month)