In [3]:
conda list | findstr scikit-learn


scikit-learn              1.0.2            py39hf11a4ad_1  
scikit-learn-intelex      2021.6.0         py39haa95532_0  

Note: you may need to restart the kernel to use updated packages.


In [4]:
!python -V

Python 3.9.13


In [42]:
import pickle
import pandas as pd
import pyarrow
import os

In [9]:

with open('model.pkl', 'rb') as f_in:
    dv, lr = pickle.load(f_in)


In [10]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [12]:
# Load the data
df = pd.read_parquet('yellow_tripdata_2023-03 (1).parquet')

In [14]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [15]:
y_pred

array([23.2066124, 23.2066124, 23.2066124, ..., 23.2066124, 23.2066124,
       23.2066124])

In [16]:
# Calculate the standard deviation of the predicted durations
std_pred = y_pred.std()

In [17]:
std_pred

1.3500311979441904e-13

In [20]:
# Assuming you have the year and month variables defined
year = 2023
month = 3


In [30]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,ride_id,pickup_datetime,dropoff_datetime,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,...,0.0,0.0,1.0,11.1,0.0,0.0,2023/03_0,2023-03-01 00:06:43,2023-03-01 00:16:43,10.0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,...,12.54,0.0,1.0,76.49,2.5,1.25,2023/03_1,2023-03-01 00:08:25,2023-03-01 00:39:30,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,...,4.65,0.0,1.0,28.05,2.5,0.0,2023/03_2,2023-03-01 00:15:04,2023-03-01 00:29:26,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,...,4.1,0.0,1.0,24.7,2.5,0.0,2023/03_3,2023-03-01 00:49:37,2023-03-01 01:01:05,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,...,2.44,0.0,1.0,14.64,2.5,0.0,2023/03_4,2023-03-01 00:08:04,2023-03-01 00:11:06,3.033333


In [21]:
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [25]:
# Example preprocessing (to be adjusted based on the actual data and initial notebook)
df['pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
df['duration'] = (df['dropoff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60


In [26]:
# Filter out any potential outliers or invalid data as done in the initial notebook
df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]


In [31]:
# Select the relevant categorical columns for transformation
categorical = ['PULocationID', 'DOLocationID']

In [32]:
# Convert the categorical columns to a list of dictionaries
dicts = df[categorical].fillna(-1).astype(int).to_dict(orient='records')


In [33]:
# Transform the data
X_val = dv.transform(dicts)

In [35]:
# Make predictions
y_pred = lr.predict(X_val)

In [36]:
# Create a results DataFrame
df_result = pd.DataFrame({
    'ride_id': df['ride_id'],
    'predicted_duration': y_pred
})


In [37]:
# Define the output file name
output_file = 'predictions.parquet'


In [38]:
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [43]:
# Check the size of the output file
output_file_size = os.path.getsize(output_file)

print(f'Size of the output file: {output_file_size} bytes')

Size of the output file: 62387398 bytes
