In [1]:
!pip freeze | grep scikit-learn

scikit-learn==1.0.2


In [1]:
import os

import joblib
import pandas as pd

In [2]:
MONTH = 2
YEAR = 2021

MODEL_FILENAME = 'model.bin'
model_file_path = os.path.join(os.getcwd(), MODEL_FILENAME)
input_file_url = f'https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_{YEAR:04d}-{MONTH:02d}.parquet'
ouptput_file_path = os.path.join(os.getcwd(), f'fhv_tripdata_predictions_{YEAR:04d}-{MONTH:02d}.parquet')

In [3]:
dv, lr = joblib.load(model_file_path)

In [4]:
categorical = ['PUlocationID', 'DOlocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration'] = df['duration'].dt.total_seconds() / 60

    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [5]:
df = read_data(input_file_url)

In [6]:
df.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                      object
DOlocationID                      object
SR_Flag                           object
Affiliated_base_number            object
duration                         float64
dtype: object

In [7]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [8]:
y_pred.mean()

16.191691679979066

In [9]:
df['ride_id'] = f'{YEAR:04d}/{MONTH:02d}_' + df.index.astype('str')

In [10]:
df_result = df[['ride_id']]
df_result['predicted_duration'] = y_pred
df_result.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_result['predicted_duration'] = y_pred


Unnamed: 0,ride_id,predicted_duration
1,2021/02_1,14.539865
2,2021/02_2,13.740422
3,2021/02_3,15.593339
4,2021/02_4,15.188118
5,2021/02_5,13.817206


In [11]:
df_result.to_parquet(
    ouptput_file_path,
    engine='pyarrow',
    compression=None,
    index=False
)

In [12]:
file_size = os.path.getsize(ouptput_file_path)
print(f"File size is {round(file_size/(1024*1024))} Mbytes")

File size is 19 Mbytes
