In [1]:
# Print the installed scikit-learn version from the current environment.
# NOTE(review): presumably this should match the version used to pickle model.bin — confirm.
!pip freeze | grep scikit-learn

scikit-learn==1.5.0


In [2]:
# Print the version of the `python` executable found on the shell PATH.
!python -V

Python 3.10.13


In [3]:
import pickle
import pandas as pd
import numpy as np

In [4]:
# Load the pickled (dv, model) pair from model.bin. Later in this notebook
# `dv.transform` turns record dicts into a feature matrix and `model.predict`
# scores it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so model.bin must come from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [5]:
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trip-record parquet file and prepare it for scoring.

    Parameters
    ----------
    filename : str or path-like
        Anything accepted by ``pd.read_parquet`` (local path or URL). The
        file must contain ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``
        and the columns listed in ``categorical``.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows whose trip duration lies in [1, 60] minutes, with
        an added float ``duration`` column (minutes) and the ``categorical``
        columns converted to strings (missing values become ``'-1'``).
        The original row labels are kept; the index is not reset.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes, derived from the pickup/dropoff timestamps.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep only rides lasting between 1 and 60 minutes (both ends inclusive).
    keep = trips['duration'].between(1, 60)
    trips = trips[keep].copy()

    # Missing location IDs become -1; going through int first drops any
    # trailing ".0" before the IDs are turned into strings.
    location_ids = trips[categorical].fillna(-1)
    trips[categorical] = location_ids.astype('int').astype('str')

    return trips

In [6]:
# Load the yellow-taxi trip records for 2023-03 directly from the public URL
# and apply read_data's duration filter and categorical conversion.
# NOTE(review): the 2023-03 in this URL is hard-coded separately from the
# `year`/`month` variables used later for ride_id — keep them in sync.
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')
# Bare expression: displays pandas' truncated view of the full frame.
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.00,1.0,N,238,42,2,8.60,1.0,0.5,0.00,0.0,1.0,11.10,0.0,0.00,10.000000
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.40,1.0,N,138,231,1,52.70,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.30,1.0,N,140,186,1,18.40,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.00,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.90,1.0,N,140,43,1,15.60,3.5,0.5,4.10,0.0,1.0,24.70,2.5,0.00,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.20,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.00,3.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3403761,2,2023-03-31 23:24:25,2023-03-31 23:40:54,,3.16,,,163,75,0,12.13,0.0,0.5,4.23,0.0,1.0,20.36,,,16.483333
3403762,2,2023-03-31 23:24:50,2023-04-01 00:04:12,,6.89,,,125,198,0,40.92,0.0,0.5,8.98,0.0,1.0,53.90,,,39.366667
3403763,2,2023-03-31 23:26:31,2023-03-31 23:49:39,,4.01,,,50,224,0,24.02,0.0,0.5,0.00,0.0,1.0,28.02,,,23.133333
3403764,2,2023-03-31 23:07:51,2023-03-31 23:15:56,,1.31,,,113,158,0,8.51,0.0,0.5,3.50,0.0,1.0,16.01,,,8.083333


In [7]:
# Build one {column: value} dict per ride from the categorical columns,
# encode them with the loaded `dv`, and score the resulting matrix with `model`.
# NOTE(review): `dv` is presumably a fitted DictVectorizer — confirm against the training code.
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

In [8]:
# Standard deviation of the predictions (np.std defaults to ddof=0, i.e. population std).
np.std(y_pred)

6.247503142838234

In [9]:
# Display the raw prediction array.
y_pred

array([16.24572173, 26.13494936, 11.88431659, ..., 11.5953892 ,
       13.11320548, 12.90068731])

In [11]:
# Build a per-ride identifier of the form "YYYY/MM_<row label>".
# The row label is df's index, which read_data does not reset after filtering,
# so the numeric suffixes have gaps where rows were dropped.
year = 2023
month = 3
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [12]:
# Pair each prediction with its ride id. Both sides are converted to plain
# lists so they are matched by position rather than by df's (gapped) index;
# the resulting frame gets a fresh 0..n-1 index.
df_result = pd.DataFrame({
    'predictions': y_pred.tolist(),
    'ride_id': df['ride_id'].tolist(),
})
df_result

Unnamed: 0,predictions,ride_id
0,16.245722,2023/03_0
1,26.134949,2023/03_1
2,11.884317,2023/03_2
3,11.997753,2023/03_3
4,10.234506,2023/03_4
...,...,...
3316211,11.952613,2023/03_3403761
3316212,20.045291,2023/03_3403762
3316213,11.595389,2023/03_3403763
3316214,13.113205,2023/03_3403764


In [13]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.8/40.8 MB 29.0 MB/s eta 0:00:00
Installing collected packages: pyarrow
Successfully installed pyarrow-16.1.0


In [15]:
# Persist the predictions to df_result.parquet in the working directory.
#   engine='pyarrow'  - write with the pyarrow backend
#   compression=None  - store the file uncompressed
#   index=False       - do not write the DataFrame index as a column
df_result.to_parquet(
    'df_result.parquet',
    engine='pyarrow',
    compression=None,
    index=False
)