In [96]:
import pandas as pd
import polars as pl
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [101]:
def read_dataframe(filename, min_duration=1, max_duration=60):
    """Load a trip parquet file and keep only trips within a duration range.

    A ``duration`` column is added, holding the time between
    ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` expressed in
    minutes. Rows whose duration lies outside
    ``[min_duration, max_duration]`` are then discarded.

    Parameters
    ----------
    filename : str
        Path or URL handed straight to ``pl.read_parquet``.
    min_duration : int or float, default 1
        Shortest trip to keep, in minutes (inclusive).
    max_duration : int or float, default 60
        Longest trip to keep, in minutes (inclusive).

    Returns
    -------
    pl.DataFrame
        The loaded frame with the extra ``duration`` column, restricted to
        rows inside the requested range.

    Raises
    ------
    ValueError
        If ``min_duration`` is greater than ``max_duration``; such a range
        would silently produce an empty frame.
    """
    if min_duration > max_duration:
        raise ValueError(
            f'min_duration ({min_duration}) must not exceed max_duration ({max_duration})'
        )

    # Elapsed time per trip, converted from seconds to minutes.
    elapsed = pl.col('tpep_dropoff_datetime') - pl.col('tpep_pickup_datetime')
    df = pl.read_parquet(filename).with_columns(
        (elapsed.dt.total_seconds() / 60).alias('duration')
    )

    # Both bounds are inclusive, matching the filter applied to the training month.
    in_range = (pl.col('duration') >= min_duration) & (pl.col('duration') <= max_duration)
    return df.filter(in_range)

### Q1. Downloading the data

In [3]:
# Month (YYYY-MM) of the yellow-taxi trip file used for training.
date = '2023-01'

# Remote parquet file for that month; the month is interpolated into the file name.
link = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{date}.parquet'

# Read the raw, unfiltered frame directly from the URL.
df = pl.read_parquet(link)

In [9]:
# Column names of the raw frame, before any derived column is added.
df.columns

['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'airport_fee']

In [10]:
# Number of columns in the raw frame (answer to Q1).
df.width

19

### Q2. Computing duration

In [61]:
# Trip duration in minutes: (drop-off - pick-up) in seconds, divided by 60.
df = df.with_columns(
    pl.col('tpep_dropoff_datetime')
    .sub(pl.col('tpep_pickup_datetime'))
    .dt.total_seconds()
    .truediv(60)
    .alias('duration')
)

In [63]:
# Standard deviation of trip duration over the unfiltered month (answer to Q2).
df.get_column('duration').std()

42.59435124195457

In [78]:
# Summary statistics with quintile cut points; the min/max rows expose the outliers.
df['duration'].describe(percentiles=[0.2, 0.4, 0.6, 0.8])

statistic,value
str,f64
"""count""",3066766.0
"""null_count""",0.0
"""mean""",15.668995
"""std""",42.594351
"""min""",-29.2
"""20%""",6.283333
"""40%""",9.65
"""60%""",13.733333
"""80%""",20.533333
"""max""",10029.183333


### Q3. Dropping outliers

In [64]:
# Keep trips lasting between 1 and 60 minutes, both ends inclusive.
df_filtered = df.filter(pl.col('duration').is_between(1, 60))

In [65]:
# Share of rows that survive the outlier filter (answer to Q3).
df_filtered.height / df.height

0.9812202822125979

### Q4. One-hot encoding

In [84]:
# Pick-up and drop-off zone IDs are the only model features.
categorical = ['PULocationID', 'DOLocationID']

# Cast the IDs to strings so the vectorizer treats them as categories,
# then emit one {column: value} dict per trip.
train_dicts = (
    df_filtered
    .select(categorical)
    .cast(str)
    .to_dicts()
)

In [87]:
# Learn the feature vocabulary from the training dicts and encode them in a single pass.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
# Display the matrix summary; its column count is the feature dimensionality (answer to Q4).
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6018346 stored elements and shape (3009173, 515)>

### Q5. Training a model

In [93]:
# Regression target: trip duration, taken from the same filtered frame that produced X_train.
target = 'duration'
y_train = df_filtered[target].to_numpy()

# Plain linear regression with default settings on the encoded location features.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [99]:
# In-sample error: predict on the data the model was fitted on (answer to Q5).
y_pred = lr.predict(X_train)
rmse = root_mean_squared_error(y_true=y_train, y_pred=y_pred)
rmse

7.649262052602302

### Q6. Evaluating the model

In [104]:
# Month (YYYY-MM) held out for validation; this rebinds `date` from the training month.
date = '2023-02'

# read_dataframe adds the duration column and applies the same 1-60 minute filter as training.
df_val = read_dataframe(f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{date}.parquet')

In [105]:
# Build validation dicts exactly as for training, then encode them with the
# already-fitted vectorizer (transform only, no refit).
val_dicts = (
    df_val
    .select(categorical)
    .cast(str)
    .to_dicts()
)
X_val = dv.transform(val_dicts)

In [106]:
# Out-of-sample error on the validation month (answer to Q6).
y_val = df_val.get_column(target).to_numpy()
y_pred = lr.predict(X_val)
rmse = root_mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse

7.81181466707864