In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
df_jan = pd.read_parquet("../data/yellow_tripdata_2022-01.parquet")
df_jan.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


# Q1. Read the data for January. 
How many columns are there?

In [4]:
df_jan.shape

(2463931, 19)

Answer = 19

# Q2. Computing duration

Now let's compute the duration variable. It should contain the duration of a ride in minutes.
What's the standard deviation of the trips duration in January?

In [5]:
df_jan["duration"] = (
    df_jan["tpep_dropoff_datetime"] - df_jan["tpep_pickup_datetime"]
).dt.total_seconds() / 60

In [6]:
df_jan["duration"].std()

46.44530513776802

Answer = 46.45

# Q3. Dropping outliers

Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records left after you dropped the outliers?

In [22]:
df_jan_cleaned = (
    df_jan.loc[(1 <= df_jan["duration"]) & (df_jan["duration"] <= 60)]
    .copy()
    .reset_index(drop=True)
)

In [24]:
100 * df_jan_cleaned.shape[0]/df_jan.shape[0]

98.27547930522405

# Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries
- Fit a dictionary vectorizer
- Get a feature matrix from it
- What's the dimensionality of this matrix (number of columns)?

In [9]:
categorical = ["PULocationID","DOLocationID"]
numerical = ["trip_distance"]
df_jan_cleaned[categorical] = df_jan_cleaned[categorical].astype(str)

In [10]:
dv = DictVectorizer()
train_dict = df_jan_cleaned[categorical + numerical].to_dict(orient="records")

In [11]:
X_train = dv.fit_transform(train_dict)

In [12]:
X_train
type(X_train)
# X_train is a scipy sparse matrix

scipy.sparse._csr.csr_matrix

In [13]:
# dimensionality of the matrix
len(dv.feature_names_)

516

# Q5. Training a model

Now let's use the feature matrix from the previous step to train a model.

Train a plain linear regression model with default parameters
Calculate the RMSE of the model on the training data
What's the RMSE on train?

In [14]:
target = "duration"

In [15]:
y_train = df_jan_cleaned[target].values
model = LinearRegression()
model.fit(X_train, y_train)

In [16]:
y_pred = model.predict(X_train)
print("RMSE: ")
np.sqrt(mean_squared_error(y_train, y_pred))

RMSE: 


7.001496179445599

# Q6. Evaluating the model

Now let's apply this model to the validation dataset (February 2022).

What's the RMSE on validation?

In [17]:
def read_dataframe(path):
    df = pd.read_parquet(path)
    df[target] = (
        df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    ).dt.total_seconds() / 60
    df_cleaned = (
        df.loc[(1 <= df[target]) & (df[target] <= 60)]
        .copy()
        .reset_index(drop=True)
    )
    return df_cleaned

In [18]:
df_feb_cleaned = read_dataframe("../data/yellow_tripdata_2022-02.parquet")

In [20]:
def validate(df, dv, model):
    val_dict = df[categorical + numerical].to_dict(orient="records")
    X_val = dv.transform(val_dict)
    y_val = df[target]
    y_val_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val_pred, y_val))
    return rmse

In [21]:
validate(df_feb_cleaned, dv, model)

12.302084931408636