## Q1. Downloading the data

In [2]:
import pandas as pd

df = pd.read_parquet('data/yellow_tripdata_2023-02.parquet')
print(len(df.columns))

19


## Q2. Computing duration

In [3]:
january_data = pd.read_parquet('data/yellow_tripdata_2023-01.parquet')

january_data['duration'] = (pd.to_datetime(january_data['tpep_dropoff_datetime']) - pd.to_datetime(january_data['tpep_pickup_datetime'])).dt.total_seconds() / 60
duration_std = january_data['duration'].std()
print(f"The standard deviation of the trip duration in January is {duration_std:.2f} minutes.")

The standard deviation of the trip duration in January is 42.59 minutes.


## Q3. Dropping outliers

In [4]:
filtered_data = january_data[(january_data['duration'] >= 1) & (january_data['duration'] <= 60)]
fraction_left = len(filtered_data) / len(january_data) * 100
print(f"The fraction of records left after dropping outliers is {fraction_left:.0f}%.")


The fraction of records left after dropping outliers is 98%.


## Q4. One-hot encoding

In [5]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

filtered_data['PULocationID'] = filtered_data['PULocationID'].astype(str)
filtered_data['DOLocationID'] = filtered_data['DOLocationID'].astype(str)


data_dicts = filtered_data[['PULocationID', 'DOLocationID']].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X = dv.fit_transform(data_dicts)

num_columns = X.shape[1]

print(f"The dimensionality of the feature matrix is {num_columns} columns.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['PULocationID'] = filtered_data['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['DOLocationID'] = filtered_data['DOLocationID'].astype(str)


The dimensionality of the feature matrix is 515 columns.


## Q5. Training a model

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
y = filtered_data['duration'].values

model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)

rmse = np.sqrt(mean_squared_error(y, y_pred))

print(f"The RMSE of the model on the training data is {rmse:.2f}.")

## Q6. Evaluating the model

In [None]:
february_data = pd.read_parquet('path_to_february_file.parquet')

# Compute the duration in minutes for February data
february_data['duration'] = (pd.to_datetime(february_data['tpep_dropoff_datetime']) - pd.to_datetime(february_data['tpep_pickup_datetime'])).dt.total_seconds() / 60

# Filter the data to remove outliers
filtered_february_data = february_data[(february_data['duration'] >= 1) & (february_data['duration'] <= 60)]

filtered_february_data['PULocationID'] = filtered_february_data['PULocationID'].astype(str)
filtered_february_data['DOLocationID'] = filtered_february_data['DOLocationID'].astype(str)

february_data_dicts = filtered_february_data[['PULocationID', 'DOLocationID']].to_dict(orient='records')

X_val = dv.transform(february_data_dicts)

y_val = filtered_february_data['duration'].values

y_val_pred = model.predict(X_val)

rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

print(f"The RMSE of the model on the validation data is {rmse_val:.2f}.")
