In [44]:
import polars as pl
import numpy as np
import pathlib
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [2]:
data_path = pathlib.Path("/home/max509/Documents/education/mlops-zoomcamp/data")
yellow_tripdata_jan_df = pl.read_parquet(data_path / "yellow_tripdata_2023-01.parquet")
yellow_tripdata_feb_df = pl.read_parquet(data_path / "yellow_tripdata_2023-02.parquet")

In [3]:
yellow_tripdata_jan_df

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,"""N""",161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,"""N""",43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,"""N""",48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,"""N""",138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,"""N""",107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2,2023-01-31 23:58:34,2023-02-01 00:12:33,,3.05,,,107,48,0,15.8,0.0,0.5,3.96,0.0,1.0,23.76,,
2,2023-01-31 23:31:09,2023-01-31 23:50:36,,5.8,,,112,75,0,22.43,0.0,0.5,2.64,0.0,1.0,29.07,,
2,2023-01-31 23:01:05,2023-01-31 23:25:36,,4.67,,,114,239,0,17.61,0.0,0.5,5.32,0.0,1.0,26.93,,
2,2023-01-31 23:40:00,2023-01-31 23:53:00,,3.15,,,230,79,0,18.15,0.0,0.5,4.43,0.0,1.0,26.58,,


In [8]:
yellow_tripdata_feb_df = yellow_tripdata_feb_df.rename({"Airport_fee": "airport_fee"})

In [9]:
yellow_tripdata_df = pl.concat(
    (yellow_tripdata_jan_df, yellow_tripdata_feb_df), how="vertical_relaxed"
)
yellow_tripdata_df

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
i64,datetime[ns],datetime[ns],f64,f64,f64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,"""N""",161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,"""N""",43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,"""N""",48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,"""N""",138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,"""N""",107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2,2023-02-28 23:46:00,2023-03-01 00:05:00,,4.65,,,249,140,0,20.22,0.0,0.5,4.84,0.0,1.0,29.06,,
2,2023-02-28 23:26:02,2023-02-28 23:37:10,,2.47,,,186,79,0,13.66,0.0,0.5,2.65,0.0,1.0,20.31,,
2,2023-02-28 23:24:00,2023-02-28 23:38:00,,3.49,,,158,143,0,17.64,0.0,0.5,0.0,0.0,1.0,21.64,,
2,2023-02-28 23:03:00,2023-02-28 23:10:00,,2.13,,,79,162,0,13.56,0.0,0.5,2.63,0.0,1.0,20.19,,


In [12]:
duration_jan = (
    yellow_tripdata_jan_df["tpep_dropoff_datetime"]
    - yellow_tripdata_jan_df["tpep_pickup_datetime"]
)
duration_jan_minutes = duration_jan.map_elements(
    lambda v: v.total_seconds() / 60, return_dtype=pl.Float64
)
duration_jan_minutes

tpep_dropoff_datetime
f64
8.433333
6.316667
12.75
9.616667
10.833333
…
13.983333
19.45
24.516667
13.0


In [13]:
duration_jan_minutes_std = np.std(duration_jan_minutes.to_numpy())
duration_jan_minutes_std

42.59434429744777

In [38]:
duration_jan_minutes_not_outliers_mask = (duration_jan_minutes >= 1) & (
    duration_jan_minutes <= 60
)
duration_jan_minutes_not_outliers = duration_jan_minutes.filter(
    duration_jan_minutes_not_outliers_mask
)
duration_jan_minutes_not_outliers

tpep_dropoff_datetime
f64
8.433333
6.316667
12.75
9.616667
10.833333
…
13.983333
19.45
24.516667
13.0


In [18]:
len(duration_jan_minutes_not_outliers) / len(duration_jan_minutes)

0.9812202822125979

In [39]:
yellow_tripdata_jan_ids = [
    {"PULocationID": str(row[0]), "DOLocationID": str(row[1])}
    for row in yellow_tripdata_jan_df.filter(duration_jan_minutes_not_outliers_mask)[
        ["PULocationID", "DOLocationID"]
    ].iter_rows()
]
yellow_tripdata_jan_ids

[{'PULocationID': '161', 'DOLocationID': '141'},
 {'PULocationID': '43', 'DOLocationID': '237'},
 {'PULocationID': '48', 'DOLocationID': '238'},
 {'PULocationID': '138', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '79'},
 {'PULocationID': '161', 'DOLocationID': '137'},
 {'PULocationID': '239', 'DOLocationID': '143'},
 {'PULocationID': '142', 'DOLocationID': '200'},
 {'PULocationID': '164', 'DOLocationID': '236'},
 {'PULocationID': '141', 'DOLocationID': '107'},
 {'PULocationID': '234', 'DOLocationID': '68'},
 {'PULocationID': '79', 'DOLocationID': '264'},
 {'PULocationID': '164', 'DOLocationID': '143'},
 {'PULocationID': '138', 'DOLocationID': '33'},
 {'PULocationID': '33', 'DOLocationID': '61'},
 {'PULocationID': '79', 'DOLocationID': '186'},
 {'PULocationID': '90', 'DOLocationID': '48'},
 {'PULocationID': '113', 'DOLocationID': '255'},
 {'PULocationID': '237', 'DOLocationID': '239'},
 {'PULocationID': '143', 'DOLocationID': '229'},
 {'PULocationID': '137', 'DOLocat

In [40]:
dict_vectorizer = DictVectorizer().fit(yellow_tripdata_jan_ids)

In [41]:
dict_vectorizer.vocabulary_

{'DOLocationID=1': 0,
 'DOLocationID=10': 1,
 'DOLocationID=100': 2,
 'DOLocationID=101': 3,
 'DOLocationID=102': 4,
 'DOLocationID=106': 5,
 'DOLocationID=107': 6,
 'DOLocationID=108': 7,
 'DOLocationID=109': 8,
 'DOLocationID=11': 9,
 'DOLocationID=111': 10,
 'DOLocationID=112': 11,
 'DOLocationID=113': 12,
 'DOLocationID=114': 13,
 'DOLocationID=115': 14,
 'DOLocationID=116': 15,
 'DOLocationID=117': 16,
 'DOLocationID=118': 17,
 'DOLocationID=119': 18,
 'DOLocationID=12': 19,
 'DOLocationID=120': 20,
 'DOLocationID=121': 21,
 'DOLocationID=122': 22,
 'DOLocationID=123': 23,
 'DOLocationID=124': 24,
 'DOLocationID=125': 25,
 'DOLocationID=126': 26,
 'DOLocationID=127': 27,
 'DOLocationID=128': 28,
 'DOLocationID=129': 29,
 'DOLocationID=13': 30,
 'DOLocationID=130': 31,
 'DOLocationID=131': 32,
 'DOLocationID=132': 33,
 'DOLocationID=133': 34,
 'DOLocationID=134': 35,
 'DOLocationID=135': 36,
 'DOLocationID=136': 37,
 'DOLocationID=137': 38,
 'DOLocationID=138': 39,
 'DOLocationID=1

In [42]:
yellow_tripdata_jan_ids_ohe = dict_vectorizer.transform(yellow_tripdata_jan_ids)
yellow_tripdata_jan_ids_ohe

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

In [43]:
linear_model = LinearRegression().fit(
    yellow_tripdata_jan_ids_ohe, duration_jan_minutes_not_outliers
)

In [46]:
rmse_train = root_mean_squared_error(
    duration_jan_minutes_not_outliers, linear_model.predict(yellow_tripdata_jan_ids_ohe)
)

In [47]:
rmse_train

7.649261822035489

In [50]:
duration_feb = (
    yellow_tripdata_feb_df["tpep_dropoff_datetime"]
    - yellow_tripdata_feb_df["tpep_pickup_datetime"]
)
duration_feb_minutes = duration_feb.map_elements(
    lambda v: v.total_seconds() / 60, return_dtype=pl.Float64
)
duration_feb_minutes

tpep_dropoff_datetime
f64
1.683333
0.233333
0.233333
32.083333
13.3
…
19.0
11.133333
14.0
7.0


In [53]:
duration_feb_minutes_not_outliers_mask = (duration_feb_minutes >= 1) & (
    duration_feb_minutes <= 60
)
duration_feb_minutes_not_outliers = duration_feb_minutes.filter(
    duration_feb_minutes_not_outliers_mask
)
duration_feb_minutes_not_outliers

tpep_dropoff_datetime
f64
1.683333
32.083333
13.3
14.633333
27.95
…
19.0
11.133333
14.0
7.0


In [54]:
yellow_tripdata_feb_ids = [
    {"PULocationID": str(row[0]), "DOLocationID": str(row[1])}
    for row in yellow_tripdata_feb_df.filter(duration_feb_minutes_not_outliers_mask)[
        ["PULocationID", "DOLocationID"]
    ].iter_rows()
]
yellow_tripdata_feb_ids

[{'PULocationID': '142', 'DOLocationID': '163'},
 {'PULocationID': '132', 'DOLocationID': '26'},
 {'PULocationID': '161', 'DOLocationID': '145'},
 {'PULocationID': '148', 'DOLocationID': '236'},
 {'PULocationID': '137', 'DOLocationID': '244'},
 {'PULocationID': '263', 'DOLocationID': '141'},
 {'PULocationID': '48', 'DOLocationID': '243'},
 {'PULocationID': '114', 'DOLocationID': '211'},
 {'PULocationID': '114', 'DOLocationID': '249'},
 {'PULocationID': '125', 'DOLocationID': '107'},
 {'PULocationID': '140', 'DOLocationID': '42'},
 {'PULocationID': '140', 'DOLocationID': '226'},
 {'PULocationID': '249', 'DOLocationID': '90'},
 {'PULocationID': '234', 'DOLocationID': '4'},
 {'PULocationID': '114', 'DOLocationID': '125'},
 {'PULocationID': '132', 'DOLocationID': '239'},
 {'PULocationID': '132', 'DOLocationID': '230'},
 {'PULocationID': '140', 'DOLocationID': '68'},
 {'PULocationID': '144', 'DOLocationID': '79'},
 {'PULocationID': '132', 'DOLocationID': '90'},
 {'PULocationID': '236', 'DOL

In [55]:
yellow_tripdata_feb_ids_ohe = dict_vectorizer.transform(yellow_tripdata_feb_ids)

In [57]:
val_rms = root_mean_squared_error(
    linear_model.predict(yellow_tripdata_feb_ids_ohe), duration_feb_minutes_not_outliers
)
val_rms

7.811821332387183