# MLOps Zoomcamp 2024

# Homework 1

# Install libraries

In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

# Q1. Downloading the data

We'll use [the same NYC taxi dataset](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page), but instead of "Green Taxi Trip Records", we'll use "Yellow Taxi Trip Records".

Download the data for January and February 2023.

Read the data for January. How many columns are there?

In [3]:
path_to_data_yellow_jan_2023 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
path_to_data_yellow_feb_2023 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

In [4]:
# Read in the data
df_jan_2023 = pd.read_parquet(path_to_data_yellow_jan_2023)
df_feb_2023 = pd.read_parquet(path_to_data_yellow_feb_2023)

In [5]:
df_jan_2023.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0


In [6]:
print('Rows: {:,}'.format(df_jan_2023.shape[0]))
print('Columns: {}'.format(df_jan_2023.shape[1]))

Rows: 3,066,766
Columns: 19


# Q2. Computing duration

Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the standard deviation of the trips duration in January?

In [7]:
# Check the types of the columns
df_jan_2023[['tpep_pickup_datetime', 'tpep_dropoff_datetime']].dtypes

tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
dtype: object

In [8]:
# Create the duration column in minutes
df_jan_2023['duration'] = df_jan_2023['tpep_dropoff_datetime'] - df_jan_2023['tpep_pickup_datetime']

In [9]:
%%time

df_jan_2023['duration_mins'] = df_jan_2023['duration'].apply(lambda td: td.total_seconds() / 60)

CPU times: user 15.8 s, sys: 356 ms, total: 16.2 s
Wall time: 16.3 s


In [10]:
df_jan_2023.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration,duration_mins
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,...,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,0 days 00:08:26,8.433333
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,...,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,0 days 00:06:19,6.316667


In [11]:
duration_mins_std_jan = df_jan_2023['duration_mins'].std()

print('Standard deviation of the trips duration in January: {}'.format(duration_mins_std_jan))

Standard deviation of the trips duration in January: 42.594351241920904


# Q3. Dropping outliers

Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records left after you dropped the outliers?

In [12]:
prop_of_data_left = ((df_jan_2023['duration_mins'] >= 1) & (df_jan_2023['duration_mins'] <= 60)).mean()

print('Percentage of the records left after dropping the outliers: {:.2%}'.format(prop_of_data_left))

Percentage of the records left after dropping the outliers: 98.12%


In [13]:
# Drop the outliers
print('Rows before dropping outliers: {:,}'.format(df_jan_2023.shape[0]))
df_jan_2023 = df_jan_2023[(df_jan_2023['duration_mins'] >= 1) & (df_jan_2023['duration_mins'] <= 60)]
print('Rows after dropping outliers: {:,}'.format(df_jan_2023.shape[0]))

Rows before dropping outliers: 3,066,766
Rows after dropping outliers: 3,009,173


# Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them).

Fit a dictionary vectorizer. Get a feature matrix from it. What's the dimensionality of this matrix (number of columns)?

In [14]:
df_jan_2023.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration',
       'duration_mins'],
      dtype='object')

In [15]:
categorical = ['PULocationID', 'DOLocationID']

df_jan_2023[categorical].dtypes

PULocationID    int64
DOLocationID    int64
dtype: object

In [16]:
# Cast the categorical variables into a string
df_jan_2023[categorical] = df_jan_2023[categorical].astype(str)

df_jan_2023[categorical].dtypes

PULocationID    object
DOLocationID    object
dtype: object

In [17]:
%%time

# DictVectorizer requires a dictionary as an input. Convert variables into a dictionary
train_dicts = df_jan_2023[categorical].to_dict(orient='records')

CPU times: user 4.98 s, sys: 323 ms, total: 5.3 s
Wall time: 5.35 s


In [18]:
%%time

# Create feature matrix that will be used to train the model
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

CPU times: user 4.34 s, sys: 136 ms, total: 4.48 s
Wall time: 4.53 s


In [19]:
print('Rows of the training feature matrix: {}'.format(X_train.shape[0]))
print('Columns of the training feature matrix: {}'.format(X_train.shape[1]))

Rows of the training feature matrix: 3009173
Columns of the training feature matrix: 515


# Q5. Training a model

Now let's use the feature matrix from the previous step to train a model.

Train a plain linear regression model with default parameters. Calculate the RMSE of the model on the training data. 

What's the RMSE on train?

In [20]:
# Create the target variable values
target = 'duration_mins'
y_train = df_jan_2023[target].values

In [21]:
%%time

# Fit the Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

CPU times: user 23.1 s, sys: 18.2 s, total: 41.4 s
Wall time: 41.9 s


LinearRegression()

In [22]:
%%time 

# Create predictions on the validation set
y_pred = lr.predict(X_train)

CPU times: user 9.28 ms, sys: 2.97 ms, total: 12.2 ms
Wall time: 11.7 ms


In [23]:
%%time

# Calculate the error
rmse = mean_squared_error(y_train, y_pred, squared=False)

print('RMSE on train: {}'.format(rmse))

RMSE on train: 7.6492610279057605
CPU times: user 4.94 ms, sys: 3.43 ms, total: 8.37 ms
Wall time: 8.24 ms


# Q6. Evaluating the model

Now let's apply this model to the validation dataset (February 2023).

What's the RMSE on validation?

In [24]:
df_feb_2023.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.3,1.0,N,142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
1,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0


In [25]:
df_feb_2023[['tpep_dropoff_datetime', 'tpep_pickup_datetime']].dtypes

tpep_dropoff_datetime    datetime64[ns]
tpep_pickup_datetime     datetime64[ns]
dtype: object

In [26]:
%%time

# Create the duration variable
df_feb_2023['duration'] = df_feb_2023['tpep_dropoff_datetime'] - df_feb_2023['tpep_pickup_datetime']
df_feb_2023['duration_mins'] = df_feb_2023['duration'].apply(lambda td: td.total_seconds() / 60)

# Drop the outliers
df_feb_2023 = df_feb_2023[(df_feb_2023['duration_mins'] >= 1) & (df_feb_2023['duration_mins'] <= 60)]

CPU times: user 15.1 s, sys: 508 ms, total: 15.6 s
Wall time: 15.8 s


In [27]:
# Cast categorical variables
categorical = ['PULocationID', 'DOLocationID']
df_feb_2023[categorical] = df_feb_2023[categorical].astype(str)

In [28]:
%%time

# DictVectorizer requires a dictionary as an input. Convert variables into a dictionary
val_dicts = df_feb_2023[categorical].to_dict(orient='records')

# Create feature matrix
X_val = dv.transform(val_dicts)

CPU times: user 8.9 s, sys: 625 ms, total: 9.52 s
Wall time: 9.59 s


In [30]:
# Create the target variable values
target = 'duration_mins'
y_val = df_feb_2023[target].values

In [31]:
# Create predictions on the validation set
y_pred = lr.predict(X_val)

In [33]:
# Calculate the error
rmse_val = mean_squared_error(y_val, y_pred, squared=False)

print('RMSE on validation: {}'.format(rmse_val))

RMSE on validation: 7.81183265470218
