In [17]:
!python -V

Python 3.12.6


In [18]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [19]:
# Read the data for January. How many columns are there?

In [20]:
df_january = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')
number_of_columns = len(df_january.columns)
number_of_columns

19

In [21]:
# What's the standard deviation of the trips duration in January?

In [22]:
df_january['duration'] = df_january.tpep_dropoff_datetime - df_january.tpep_pickup_datetime
df_january.duration = df_january.duration.apply(lambda td: td.total_seconds() / 60)
df_january['duration'].std()

np.float64(42.59435124195458)

In [23]:
# Next, we need to check the distribution of the duration variable. 
# There are some outliers. 
# Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).
# What fraction of the records left after you dropped the outliers?

In [24]:
original_count = len(df_january)
df_january_filtered = df_january[(df_january.duration >= 1) & (df_january.duration <= 60)]
filtered_count = len(df_january_filtered)
fraction_remaining = filtered_count / original_count

# Fraction of records left
original_count, filtered_count, fraction_remaining

(3066766, 3009173, 0.9812202822125979)

In [25]:
#Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.
#Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
#Fit a dictionary vectorizer
#Get a feature matrix from it
#What's the dimensionality of this matrix (number of columns)?

In [26]:
# Convert IDs to strings to ensure one-hot encoding instead of label encoding
# Use only the pickup and dropoff columns
categorical = ['PULocationID', 'DOLocationID']
df_january_filtered[categorical] = df_january_filtered[categorical].astype(str)

# Turn into list of dictionaries
dicts = df_january_filtered[categorical].to_dict(orient='records')

# Fit DictVectorizer
dv = DictVectorizer()
X = dv.fit_transform(dicts)

# Check dimensionality
X.shape[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_january_filtered[categorical] = df_january_filtered[categorical].astype(str)


515

In [27]:
# Now let's use the feature matrix from the previous step to train a model.
# Train a plain linear regression model with default parameters, where duration is the response variable
# Calculate the RMSE of the model on the training data
# What's the RMSE on train?

In [28]:
y_train = df_january_filtered['duration'].values

# Train linear regression
model = LinearRegression()
model.fit(X, y_train)

# Predict on training data
y_pred = model.predict(X)

# Calculate RMSE
rmse = root_mean_squared_error(y_train, y_pred)
print(f"RMSE on training data: {rmse:.2f}")


RMSE on training data: 7.65


In [29]:
# Now let's apply this model to the validation dataset (February 2023).
# What's the RMSE on validation?

In [33]:
df_february = pd.read_parquet('./data/yellow_tripdata_2023-02.parquet')

# Compute duration in minutes
df_february['duration'] = df_february.tpep_dropoff_datetime - df_february.tpep_pickup_datetime
df_february['duration'] = df_february['duration'].apply(lambda td: td.total_seconds() / 60)

# Filter outliers: keep only trips between 1 and 60 minutes
df_february_filtered = df_february[(df_february['duration'] >= 1) & (df_february['duration'] <= 60)]

# Prepare categorical features
df_february_filtered[categorical] = df_february_filtered[categorical].astype(str)

# Convert to list of dictionaries
dicts_val = df_february_filtered[categorical].to_dict(orient='records')

# Transform using the fitted DictVectorizer (from January)
X_val = dv.transform(dicts_val)

# Target variable
y_val = df_february_filtered['duration']

# Predict and calculate RMSE using the trained model
y_pred_val = model.predict(X_val)
rmse_val = root_mean_squared_error(y_val, y_pred_val)

# Output the result
print(f"RMSE on validation data: {rmse_val:.2f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_february_filtered[categorical] = df_february_filtered[categorical].astype(str)


RMSE on validation data: 7.81
