In [1]:
# Show which scikit-learn version is installed in this environment
# (the `pip freeze` listing filtered down to the scikit-learn line).
!pip freeze | grep scikit-learn

scikit-learn==1.1.1


In [2]:
import pickle
import pandas as pd
import numpy as np
import datetime

In [3]:
# SECURITY NOTE(review): pickle.load can execute arbitrary code embedded in the
# file it reads. Only load a model file that comes from a trusted source.
# NOTE(review): pickled scikit-learn objects are tied to the library version
# that produced them; confirm this file matches the installed scikit-learn.
MODEL_PATH = 'model.bin'

# The pickle holds a pair of objects, unpacked here as `dv` and `lr`.
with open(MODEL_PATH, 'rb') as f_in:
    dv, lr = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
categorical = ['PUlocationID', 'DOlocationID']


def read_data(filename):
    """Load a trip-record parquet file and prepare it for prediction.

    Steps:
      1. Read the parquet file at ``filename``.
      2. Add a ``duration`` column: drop-off time minus pick-up time, in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes (inclusive).
      4. For each column listed in the module-level ``categorical`` list,
         replace missing values with -1 and convert the values to integer
         strings.

    Parameters
    ----------
    filename : path or URL accepted by ``pandas.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of the input with the extra ``duration`` column.
    """
    trips = pd.read_parquet(filename)

    # Trip length in minutes, derived from the two timestamp columns.
    elapsed = trips['dropOff_datetime'] - trips['pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends, i.e. 1 <= duration <= 60.
    trips = trips[trips['duration'].between(1, 60)].copy()

    # Missing location IDs become -1 before the int -> str conversion.
    for column in categorical:
        trips[column] = trips[column].fillna(-1).astype('int').astype('str')

    return trips

In [5]:
# Trip records for February 2021, located relative to the working directory.
input_file = './data/fhv_tripdata_2021-02.parquet'
df = read_data(input_file)

In [6]:
# One dict per ride, holding only the categorical columns.
dicts = df.loc[:, categorical].to_dict(orient='records')

# `dv` and `lr` are the two objects unpickled from model.bin:
# `dv` turns the dicts into a feature matrix, `lr` predicts from it.
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [7]:
# Report the average of the predictions as a quick sanity check.
mean_prediction = y_pred.mean()
print(mean_prediction)

16.191691679979066


In [8]:
# Show the prepared frame returned by read_data as this cell's output.
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173,82,,B00021,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173,56,,B00021,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82,129,,B00021,7.950000
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,-1,225,,B00037,13.800000
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,-1,61,,B00037,8.966667
...,...,...,...,...,...,...,...,...
1037687,B03282,2021-02-28 23:01:16,2021-02-28 23:14:48,-1,31,,B01717,13.533333
1037688,B03282,2021-02-28 23:36:10,2021-02-28 23:47:38,-1,169,,B01717,11.466667
1037689,B03285,2021-02-28 23:18:36,2021-02-28 23:43:59,28,171,,B03285,25.383333
1037690,B03285,2021-02-28 23:26:34,2021-02-28 23:44:37,16,252,,B03285,18.050000


In [9]:
# Pull the calendar year and month of every pick-up as string indexes.
# NOTE(review): `year` and `month` are not referenced by the later cells
# visible in this notebook — confirm whether they are still needed.
pickup_index = pd.DatetimeIndex(df['pickup_datetime'])

year = pickup_index.year.astype(str)
month = pickup_index.month.astype(str)

# An earlier draft built ride_id from the hard-coded prefix '2021/02_'
# followed by the row index.

In [10]:
# Build a per-ride identifier of the form "<year>/<month>_<row index>",
# e.g. "2021/2_1". The month is deliberately left unpadded so the IDs stay
# identical to the ones the row-wise version produced.
#
# Vectorised string concatenation replaces df.apply(..., axis=1), which calls
# a Python lambda once per row and is slow on a frame of this size. It also
# works on an empty frame.
pickup_ts = df["pickup_datetime"]
df["ride_id"] = (
    pickup_ts.dt.year.astype(str)
    + "/"
    + pickup_ts.dt.month.astype(str)
    + "_"
    + df.index.astype(str)  # combined positionally; same length as the frame
)

In [11]:
# Store each prediction next to the ride it was computed for.
df["pred"] = y_pred

In [12]:
# Show the frame again, now with the ride_id and pred columns added.
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration,ride_id,pred
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173,82,,B00021,10.666667,2021/2_1,14.539865
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173,56,,B00021,14.566667,2021/2_2,13.740422
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82,129,,B00021,7.950000,2021/2_3,15.593339
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,-1,225,,B00037,13.800000,2021/2_4,15.188118
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,-1,61,,B00037,8.966667,2021/2_5,13.817206
...,...,...,...,...,...,...,...,...,...,...
1037687,B03282,2021-02-28 23:01:16,2021-02-28 23:14:48,-1,31,,B01717,13.533333,2021/2_1037687,12.433246
1037688,B03282,2021-02-28 23:36:10,2021-02-28 23:47:38,-1,169,,B01717,11.466667,2021/2_1037688,12.910885
1037689,B03285,2021-02-28 23:18:36,2021-02-28 23:43:59,28,171,,B03285,25.383333,2021/2_1037689,15.834923
1037690,B03285,2021-02-28 23:26:34,2021-02-28 23:44:37,16,252,,B03285,18.050000,2021/2_1037690,16.783176


In [13]:
# Persist only the ride identifier and its prediction, uncompressed and
# without the DataFrame index, using the pyarrow parquet engine.
df_result = df[["ride_id", "pred"]]
df_result.to_parquet(
    'output_file.parquet',
    engine='pyarrow',
    compression=None,
    index=False,
)

In [14]:
# Export starter.ipynb to a plain Python script using nbconvert.
! jupyter nbconvert --to python starter.ipynb

[NbConvertApp] Converting notebook starter.ipynb to python
[NbConvertApp] Writing 2032 bytes to starter.py


In [15]:
# Install pipenv with pip.
# NOTE(review): the version is not pinned; consider pinning it, and using
# `%pip` if the install must target the kernel's own environment.
! pip install pipenv

[0m

Hashes

"scikit-learn": {
            "hashes": [
                "sha256:0403ad13f283e27d43b0ad875f187ec7f5d964903d92d1ed06c51439560ecea0",
                "sha256:102f51797cd8944bf44a038d106848ddf2804f2c1edf7aea45fba81a4fdc4d80",
                "sha256:22145b60fef02e597a8e7f061ebc7c51739215f11ce7fcd2ca9af22c31aa9f86",
                "sha256:33cf061ed0b79d647a3e4c3f6c52c412172836718a7cd4d11c1318d083300133",