In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.5.0


In [2]:
!python -V

Python 3.9.19


In [3]:
import pickle
import pandas as pd

import numpy as np

In [4]:
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [5]:
dv, model

(DictVectorizer(), LinearRegression())

In [6]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [7]:
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

In [8]:
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,8.6,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,52.7,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,18.4,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,15.6,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,3.033333


In [9]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [10]:
y_pred[4000:4025]

array([11.10128329, 11.12220712, 12.65958822, 31.53770023, 10.68942158,
       10.77253784, 31.00112885, 10.9931043 , 17.35020465, 30.57045854,
       11.16535326, 14.99184997, 11.29992506, 17.3162062 , 13.74752991,
       10.23927823,  9.53556128, 12.21804649, 11.19638912, 13.1229674 ,
       11.21889704, 11.50245502, 11.00548409, 10.30227075, 30.73625332])

In [11]:
y = df['duration']
print('Standard Deviation', np.std(y_pred))

Standard Deviation 6.247488852238703


In [12]:
year = 2023
month = 3
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [15]:
# Convert y_pred to a pandas Series
y_pred_series = pd.Series(y_pred, name='predicted_duration')

# Concatenate df['ride_id'] with y_pred_series
df_result = pd.concat([df['ride_id'], y_pred_series], axis=1)

# Display the result
print(df_result)

           ride_id  predicted_duration
0        2023/03_0           16.245906
1        2023/03_1           26.134796
2        2023/03_2           11.884264
3        2023/03_3           11.997720
4        2023/03_4           10.234486
...            ...                 ...
3315919        NaN           10.423729
3315968        NaN           10.683414
3316010        NaN           13.251093
3316013        NaN            9.507458
3316181        NaN           12.740809

[3400969 rows x 2 columns]


In [16]:
output_file = "output.parquet"
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)