# 0. Setup

* Libs 

In [24]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Q1. Downloading the data

In [2]:
jan_yellow_recs = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
feb_yellow_recs = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet")

In [3]:
print(f"There are {jan_yellow_recs.shape[1]} columns in the table for January 2023.")

There are 19 columns in the table for January 2023.


# Q2. Computing duration

In [5]:
# Casting columns to datetime type
jan_yellow_recs["tpep_dropoff_datetime"] = pd.to_datetime(jan_yellow_recs["tpep_dropoff_datetime"])
jan_yellow_recs["tpep_pickup_datetime"] = pd.to_datetime(jan_yellow_recs["tpep_pickup_datetime"])

In [6]:
# Computing the duration variable
jan_yellow_recs["duration"] = (jan_yellow_recs["tpep_dropoff_datetime"] - jan_yellow_recs["tpep_pickup_datetime"]).dt.total_seconds() / 60

In [7]:
# Answering the homework question
print(f"The standard deviation of the trips duration in Janueary is {jan_yellow_recs["duration"].std():.2f}")

The standard deviation of the trips duration in Janueary is 42.59


# Q3. Dropping outliers

In [8]:
outlier_filter = np.bitwise_and((1 <= jan_yellow_recs["duration"]), (60 >= jan_yellow_recs["duration"]))

In [9]:
jan_yellow_recs_no_outliers = jan_yellow_recs.loc[outlier_filter, :]

In [10]:
print(f"Fraction of the records left after dropping outliers: {(jan_yellow_recs_no_outliers.shape[0] / jan_yellow_recs.shape[0]):.2f}")

Fraction of the records left after dropping outliers: 0.98


# Q4. One-hot encoding

In [11]:
# Casting IDs to strings
jan_yellow_recs_no_outliers.loc[:, "PULocationID"] = jan_yellow_recs_no_outliers["PULocationID"].astype(str)
jan_yellow_recs_no_outliers.loc[:, "DOLocationID"] = jan_yellow_recs_no_outliers["DOLocationID"].astype(str)

  jan_yellow_recs_no_outliers.loc[:, "PULocationID"] = jan_yellow_recs_no_outliers["PULocationID"].astype(str)
  jan_yellow_recs_no_outliers.loc[:, "DOLocationID"] = jan_yellow_recs_no_outliers["DOLocationID"].astype(str)


In [36]:
locations_ids = jan_yellow_recs_no_outliers[["PULocationID", "DOLocationID"]]

In [46]:
vectorizer = OneHotEncoder(handle_unknown="ignore")
X = vectorizer.fit_transform(locations_ids)

In [38]:
print(f"The number of columns of the feature matrix is {X.shape[1]}")

The number of columns of the feature matrix is 515


# Q5. Training a model

In [39]:
y = jan_yellow_recs_no_outliers["duration"]

In [40]:
X.shape, y.shape

((3009173, 515), (3009173,))

In [41]:
model = LinearRegression()
model.fit(X, y)

In [42]:
y_pred = model.predict(X)

In [43]:
rmse = root_mean_squared_error(y_true=y, y_pred=y_pred)

In [44]:
print(f"The model's RMSE is: {rmse:.2f}")

The model's RMSE is: 7.65


# Q6. Evaluating the model

In [47]:
# Preprocessing February data

# Casting columns to datetime type
feb_yellow_recs["tpep_dropoff_datetime"] = pd.to_datetime(feb_yellow_recs["tpep_dropoff_datetime"])
feb_yellow_recs["tpep_pickup_datetime"] = pd.to_datetime(feb_yellow_recs["tpep_pickup_datetime"])

# Computing the duration variable
feb_yellow_recs["duration"] = (feb_yellow_recs["tpep_dropoff_datetime"] - feb_yellow_recs["tpep_pickup_datetime"]).dt.total_seconds() / 60

# Dropping outliers
feb_outlier_filter = np.bitwise_and((1 <= feb_yellow_recs["duration"]), (60 >= feb_yellow_recs["duration"]))
feb_yellow_recs_no_outliers = feb_yellow_recs.loc[feb_outlier_filter, :]

# Casting IDs to strings
feb_yellow_recs_no_outliers.loc[:, "PULocationID"] = feb_yellow_recs_no_outliers["PULocationID"].astype(str)
feb_yellow_recs_no_outliers.loc[:, "DOLocationID"] = feb_yellow_recs_no_outliers["DOLocationID"].astype(str)
feb_locations_ids = feb_yellow_recs_no_outliers[["PULocationID", "DOLocationID"]]

X_feb = vectorizer.transform(feb_locations_ids, )

  feb_yellow_recs_no_outliers.loc[:, "PULocationID"] = feb_yellow_recs_no_outliers["PULocationID"].astype(str)
  feb_yellow_recs_no_outliers.loc[:, "DOLocationID"] = feb_yellow_recs_no_outliers["DOLocationID"].astype(str)


In [48]:
y_feb = feb_yellow_recs_no_outliers["duration"]

In [49]:
X_feb.shape, y_feb.shape

((2855951, 515), (2855951,))

In [50]:
y_pred_feb = model.predict(X_feb)

In [51]:
feb_rmse = root_mean_squared_error(y_true=y_feb, y_pred=y_pred_feb)

In [52]:
print(f"The RMSE on validation dataset is {feb_rmse}")

The RMSE on validation dataset is 7.811817578299112
