In [1]:
!python -V

Python 3.9.12


In [2]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_parquet('../notebooks/data/yellow_tripdata_2023-01.parquet')

In [4]:
print(df.head())
print(df.info())
print(df.describe())

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2023-01-01 00:32:10   2023-01-01 00:40:36              1.0   
1         2  2023-01-01 00:55:08   2023-01-01 01:01:27              1.0   
2         2  2023-01-01 00:25:04   2023-01-01 00:37:49              1.0   
3         1  2023-01-01 00:03:48   2023-01-01 00:13:25              0.0   
4         2  2023-01-01 00:10:29   2023-01-01 00:21:19              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           0.97         1.0                  N           161           141   
1           1.10         1.0                  N            43           237   
2           2.51         1.0                  N            48           238   
3           1.90         1.0                  N           138             7   
4           1.43         1.0                  N           107            79   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [5]:
num_cols = len(df.columns)
print(f"The dataset has {num_cols} columns.")

The dataset has 19 columns.


In [6]:
df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
std_duration = df['duration'].std()

print(f"The standard deviation of trip durations in January is {std_duration:.2f} minutes.")

The standard deviation of trip durations in January is 42.59 minutes.


In [7]:
df_filtered = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
fraction_remaining = len(df_filtered) / len(df)

print(f"The fraction of remaining records is {fraction_remaining:.2%}")

The fraction of remaining records is 98.12%


In [8]:
df_filtered['PULocationID'] = df_filtered['PULocationID'].astype(str)
df_filtered['DOLocationID'] = df_filtered['DOLocationID'].astype(str)

dict_list = df_filtered[['PULocationID', 'DOLocationID']].to_dict('records')

vectorizer = DictVectorizer()

X = vectorizer.fit_transform(dict_list)
n_cols = X.shape[1]

print(f"The dimensionality of the feature matrix is {n_cols} columns.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['PULocationID'] = df_filtered['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['DOLocationID'] = df_filtered['DOLocationID'].astype(str)


The dimensionality of the feature matrix is 515 columns.


In [9]:
X = vectorizer.fit_transform(dict_list)
y = df_filtered['duration']

model = LinearRegression()
model.fit(X, y)

y_pred = model.predict(X)
rmse = mean_squared_error(y, y_pred, squared=False)

print(f"The RMSE on training is {rmse:.2f}")

The RMSE on training is 7.65


In [10]:
df_val = pd.read_parquet('../notebooks/data/yellow_tripdata_2023-02.parquet')

In [11]:
print(df.head())
print(df.info())
print(df.describe())

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2023-01-01 00:32:10   2023-01-01 00:40:36              1.0   
1         2  2023-01-01 00:55:08   2023-01-01 01:01:27              1.0   
2         2  2023-01-01 00:25:04   2023-01-01 00:37:49              1.0   
3         1  2023-01-01 00:03:48   2023-01-01 00:13:25              0.0   
4         2  2023-01-01 00:10:29   2023-01-01 00:21:19              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           0.97         1.0                  N           161           141   
1           1.10         1.0                  N            43           237   
2           2.51         1.0                  N            48           238   
3           1.90         1.0                  N           138             7   
4           1.43         1.0                  N           107            79   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [12]:
df_val['duration'] = (df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']).dt.total_seconds() / 60
df_val_filtered = df_val[(df_val['duration'] >= 1) & (df_val['duration'] <= 60)]

df_val_filtered['PULocationID'] = df_val_filtered['PULocationID'].astype(str)
df_val_filtered['DOLocationID'] = df_val_filtered['DOLocationID'].astype(str)

dict_list_val = df_val_filtered[['PULocationID', 'DOLocationID']].to_dict('records')

X_val = vectorizer.transform(dict_list_val)
y_val = df_val_filtered['duration']

y_pred_val = model.predict(X_val)

rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)

print(f"The RMSE on validation is {rmse_val:.2f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val_filtered['PULocationID'] = df_val_filtered['PULocationID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val_filtered['DOLocationID'] = df_val_filtered['DOLocationID'].astype(str)


The RMSE on validation is 7.81
