In [1]:
import pickle
import pandas as pd
import warnings
import numpy as np
import os

In [2]:
# Record the Python interpreter version and the installed scikit-learn / pandas
# versions (filtered out of `pip freeze`) alongside the notebook output.
!python --version
!pip freeze | grep scikit-learn
!pip freeze | grep pandas

Python 3.10.13
scikit-learn==1.4.2
pandas==2.2.2


In [3]:
# Suppress all warnings from this point on
warnings.filterwarnings(action="ignore")

# Restore the pickled (vectorizer, model) pair from disk.
# NOTE: unpickling can execute arbitrary code, so model.bin must come from a
# trusted source.
with open('model.bin', mode='rb') as model_file:
    artifacts = pickle.load(model_file)
dv, model = artifacts

In [4]:
# Columns fed to the vectorizer as categorical features
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for prediction.

    Adds a ``duration`` column holding the time between
    ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` in minutes, keeps
    only rows whose duration lies between 1 and 60 minutes (both inclusive),
    and converts the ``categorical`` columns to strings after filling missing
    values with -1 and casting to int.

    Parameters
    ----------
    filename : path or URL handed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered copy; surviving rows keep their original index labels.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta, expressed in minutes
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends by default
    in_range = trips['duration'].between(1, 60)
    trips = trips.loc[in_range].copy()

    trips[categorical] = trips[categorical].fillna(-1).astype('int').astype('str')

    return trips

In [5]:
# Year and month of the trip file, zero-padded to 4 and 2 digits
year = f'{2023:04d}'
month = f'{3:02d}'

# Fetch and prepare that month's trips
input_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet'
df = read_data(input_url)

# Turn the categorical columns into per-row dicts, vectorize them with the
# loaded vectorizer and score them with the loaded model
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [6]:
# Standard deviation of the predictions (np.std defaults to ddof=0)
pred_std = np.std(y_pred)
print(f'Q1: The standard deviation for predicted duration is {pred_std}')

Q1: The standard deviation for predicted duration is 6.247488852238703


In [7]:
# Pair each prediction with a ride id of the form "<year>/<month>_<row index>".
# The ids are an Index (not a Series), so they are matched to the predictions
# by position rather than by label.
ride_ids = f'{year}/{month}_' + df.index.astype('str')
df_result = pd.DataFrame({'predicted_duration': y_pred, 'ride_id': ride_ids})

In [8]:
# Write the predictions to an uncompressed parquet file, leaving the index out
output_file = f'output_yellow_tripdata_{year}-{month}.parquet'
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

In [9]:
# Size of the written file in bytes, divided by 1024 twice before reporting
size_bytes = os.path.getsize(f'output_yellow_tripdata_{year}-{month}.parquet')
file_size = size_bytes / 1024 / 1024
print(f"Q2: The size of the output file is {file_size:.2f} Megabytes")

Q2: The size of the output file is 65.46 Megabytes


In [10]:
# Q3 answer: print the nbconvert command that was used to export the notebook as a script.
print("Q3: To convert the notebook into a script I used the command \n'jupyter nbconvert --to script nazmul_homework_4.ipynb --output script'")

Q3: To convert the notebook into a script I used the command 
'jupyter nbconvert --to script nazmul_homework_4.ipynb --output script'


In [1]:
# Q4 answer: print the recorded first sha256 hash for scikit-learn.
# NOTE(review): the file this hash was copied from is not shown in the notebook.
print("Q4: The first hash for Scikit-Learn is \n'sha256:1d0b25d9c651fd050555aadd57431b53d4cf664e749069da77f3d52c5ad14b3b'")

Q4: The first hash for Scikit-Learn is 
'sha256:1d0b25d9c651fd050555aadd57431b53d4cf664e749069da77f3d52c5ad14b3b'


In [2]:
# Run the exported script from the shell.
# NOTE(review): the arguments are presumably year and month (2023, 4) - confirm against script.py.
!python script.py 2023 4

Q5: The mean predicted duration is 14.292282936862449


In [3]:
# Run the batch-docker-script:1.3 image from the shell.
# NOTE(review): the arguments are presumably year and month (2023, 5) - confirm against the image's entrypoint.
!docker run batch-docker-script:1.3 2023 5

Q6: The mean predicted duration is 0.19174419265916945
