In [1]:
!python -V

Python 3.12.3


In [2]:
import pandas as pd
import numpy as np

# Training

## Reading data for January

In [3]:
yellow = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')

In [4]:
yellow.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

## Computing duration
What's the standard deviation of the trip durations (in minutes) in January?

In [5]:
# Trip duration in minutes. The vectorised .dt accessor converts the whole
# timedelta column at once instead of calling a Python lambda for every row.
yellow['duration'] = (
    yellow.tpep_dropoff_datetime - yellow.tpep_pickup_datetime
).dt.total_seconds() / 60

In [6]:
yellow.duration.std()

np.float64(42.59435124195458)

In [7]:
pd.options.display.float_format = '{:20,.2f}'.format

In [8]:
yellow.duration.describe()

count           3,066,766.00
mean                   15.67
std                    42.59
min                   -29.20
25%                     7.12
50%                    11.52
75%                    18.30
max                10,029.18
Name: duration, dtype: float64

In [9]:
yellow.duration.describe(percentiles = [0.95, 0.98, 0.99])

count           3,066,766.00
mean                   15.67
std                    42.59
min                   -29.20
50%                    11.52
95%                    36.47
98%                    48.73
99%                    57.25
max                10,029.18
Name: duration, dtype: float64

## Dropping outliers

In [10]:
# Keep only the records where the duration was between 1 and 60 minutes
# (Series.between is inclusive on both ends by default).
# .copy() makes the filtered result an independent frame, so assigning columns
# on it later does not raise pandas' SettingWithCopyWarning.
yellow = yellow[yellow.duration.between(1, 60)].copy()

In [11]:
yellow.duration.describe()

count           3,009,173.00
mean                   14.20
std                     9.94
min                     1.00
25%                     7.22
50%                    11.55
75%                    18.18
max                    60.00
Name: duration, dtype: float64

In [12]:
# What percentage of the records is left after dropping the outliers?
3009173/3066766*100

98.1220282212598

## One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

* Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise `DictVectorizer` treats them as numeric features instead of one-hot encoding them)
* Fit a dictionary vectorizer
* Get a feature matrix from it  

What's the dimensionality of this matrix (number of columns)?

In [13]:
categorical = ['PULocationID', 'DOLocationID']
# Cast the location IDs to strings so DictVectorizer one-hot encodes them
# rather than treating them as numeric features. The cast is applied to a
# temporary frame, so `yellow` is left untouched and the cell can be re-run safely.
train_dicts = yellow[categorical].astype(str).to_dict(orient='records')

In [14]:
len(train_dicts)

3009173

In [15]:
type(train_dicts)

list

In [16]:
len(train_dicts[0])

2

In [17]:
train_dicts[0].keys()

dict_keys(['PULocationID', 'DOLocationID'])

In [18]:
train_dicts[0].items()

dict_items([('PULocationID', '161'), ('DOLocationID', '141')])

In [19]:
# Vectorise the list of dicts with scikit-learn's built-in DictVectorizer.
# String values are one-hot encoded: each distinct "column=value" pair
# becomes one column of the resulting sparse matrix.
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [20]:
# first 5 feature names
dv.get_feature_names_out()[:5]

array(['DOLocationID=1', 'DOLocationID=10', 'DOLocationID=100',
       'DOLocationID=101', 'DOLocationID=102'], dtype=object)

In [21]:
print(X_train[0])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2 stored elements and shape (1, 515)>
  Coords	Values
  (0, 43)	1.0
  (0, 325)	1.0


## Training a model
Now let's use the feature matrix from the previous step to train a model.

* Train a plain linear regression model with default parameters, where duration is the response variable
* Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Response variable: the trip duration (minutes) computed earlier in the notebook.
target = 'duration'
y_train = yellow[target].values

# Plain linear regression with default parameters, fitted on the one-hot matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the same data the model was fitted on, so the value below is the training RMSE.
y_pred = lr.predict(X_train)

root_mean_squared_error(y_train, y_pred)

7.6492624397080675

## Evaluating the model
Now let's apply this model to the validation dataset (February 2023).

What's the RMSE on validation?

In [26]:
def data_pre_processing(filename, target = 'duration'):
    """Load one month of yellow-taxi trips and prepare features and target.

    Reads the parquet file, computes the trip duration in minutes, keeps only
    trips lasting between 1 and 60 minutes (inclusive) and casts the pickup
    and dropoff location IDs to strings so they can be one-hot encoded.

    Parameters
    ----------
    filename : str or path-like
        Parquet file with `tpep_pickup_datetime`, `tpep_dropoff_datetime`,
        `PULocationID` and `DOLocationID` columns.
    target : str, default 'duration'
        Name of the column returned as the response variable.

    Returns
    -------
    tuple
        `(X, y)`, where `X` is a DataFrame with the two string-typed location
        columns and `y` is the array of `target` values for the same rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised conversion of the timedelta column to minutes; avoids calling
    # a Python lambda once per row.
    df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame, so the column
    # assignment below does not raise SettingWithCopyWarning.
    df = df[df.duration.between(1, 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df[categorical], df[target].values

In [27]:
X_train, y_train = data_pre_processing('./data/yellow_tripdata_2023-01.parquet')
X_test, y_test = data_pre_processing('./data/yellow_tripdata_2023-02.parquet')

In [28]:
len(X_train), len(y_train)

(3009173, 3009173)

In [29]:
len(X_test), len(y_test)

(2855951, 2855951)

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Two-step pipeline: one-hot encode the location columns (sparse output), then
# fit a linear regression on the encoded matrix.
# handle_unknown='ignore': categories not seen during fit are encoded as all
# zeros at transform time instead of raising an error.
pipe = Pipeline([('hot_encoding', OneHotEncoder(sparse_output=True, handle_unknown='ignore')), 
                 ('model', LinearRegression())])

In [42]:
pipe.fit(X_train, y_train).score(X_test, y_test)

0.39751850397006006

In [43]:
y_pred = pipe.predict(X_test)

In [45]:
root_mean_squared_error(y_test, y_pred)

7.811819746035725

In [46]:
import sklearn

In [47]:
sklearn.__version__

'1.6.1'