In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer

In [3]:
from sklearn.linear_model import LinearRegression

In [4]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [5]:
# Load the 2023-01 yellow-taxi trip file; this frame becomes the training data below.
df_01_23 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [6]:
# Load the 2023-02 yellow-taxi trip file.
# NOTE(review): no later visible cell references `df_02_23`; the validation frame is
# fetched again from this same URL through read_dataframe(). Consider reusing one of them.
df_02_23 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [7]:
# Number of columns in the raw January frame.
len(df_01_23.columns)

19

In [8]:
# Trip duration in minutes: (drop-off - pickup) expressed in seconds, then divided by 60.
df_01_23['trip_duration_minutes'] = (
    df_01_23['tpep_dropoff_datetime']
    .sub(df_01_23['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)


In [9]:
# Standard deviation of the raw durations (taken before the outlier filter is applied).
std_dev_01_23 = df_01_23.trip_duration_minutes.std()

In [10]:
# Report the spread of the unfiltered January durations, rounded to two decimals.
print(f"Standard Deviation of trip durations for Jan 2023: {std_dev_01_23:.2f} minutes")

Standard Deviation of trip durations for Jan 2023: 42.59 minutes


In [11]:
# Row count before outlier removal; used as the denominator for the retained fraction.
original_rows = len(df_01_23)

In [12]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# The explicit .copy() makes the result an independent frame, so the later
# column assignment on it is not a write into a slice of the unfiltered data
# (which triggers SettingWithCopyWarning on pandas < 3).
df_01_23 = df_01_23[(df_01_23.trip_duration_minutes >= 1) & (df_01_23.trip_duration_minutes <= 60)].copy()

In [13]:
# Share of the original rows that survived the duration filter, as a percentage.
print(f"fraction of the records left after dropping the outliers: {len(df_01_23)/original_rows:%}")

fraction of the records left after dropping the outliers: 98.122028%


In [14]:
# Columns used as categorical model features (pickup and drop-off location IDs).
categorical = ['PULocationID', 'DOLocationID']

In [15]:
# Cast the location IDs to strings so they are treated as categories rather than numbers.
# astype() with a column->dtype mapping returns a new frame, so nothing is assigned into
# the filtered slice (the original `df[cols] = ...` form warns with SettingWithCopyWarning
# on pandas < 3).
df_01_23 = df_01_23.astype(dict.fromkeys(categorical, str))

In [16]:
# One {column: value} dict per training row, restricted to the categorical features.
train_dicts = df_01_23.loc[:, categorical].to_dict(orient='records')

In [17]:
# Vectorizer that turns the per-row feature dicts into a sparse feature matrix.
dv = DictVectorizer()

In [18]:
# Learn the feature set from the training dicts and encode them in a single pass.
X_train = dv.fit_transform(train_dicts)

In [19]:
# Display the matrix summary (shape, dtype, storage format).
X_train

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

In [20]:
# Number of encoded features, i.e. the width of the training matrix.
dv.get_feature_names_out().shape[0]

515

In [21]:
# Name of the column the model is trained to predict.
target = 'trip_duration_minutes'

In [22]:
# Training target (a pandas Series) taken from the same filtered frame that produced X_train.
y_train = df_01_23[target]

In [23]:
# Linear regression model with scikit-learn's default settings.
lr = LinearRegression()

In [24]:
# Fit on the encoded pickup/drop-off features to predict trip duration in minutes.
lr.fit(X_train, y_train)

LinearRegression()

In [25]:
# In-sample predictions, used to measure the training error.
# Note: the name `y_pred` is reassigned later for the validation predictions.
y_pred = lr.predict(X_train)

In [26]:
# Root-mean-squared error on the training data (same unit as the target: minutes).
rmse = np.sqrt(mean_squared_error(y_train, y_pred))

In [27]:
# Report the training RMSE to four decimals.
print(f"RMSE on training data: {rmse:.4f}")

RMSE on training data: 7.6493


In [29]:
# Read a trip-data parquet file and prepare it for modelling.
def read_dataframe(filename):
    """Load trip records and apply the same preparation used for the training data.

    Steps:
      1. read the parquet file at ``filename`` (path or URL accepted by
         ``pd.read_parquet``);
      2. add ``trip_duration_minutes`` = drop-off time minus pickup time, in minutes;
      3. keep only rows whose duration is between 1 and 60 minutes (inclusive);
      4. cast ``PULocationID`` and ``DOLocationID`` to strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; it must contain the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``, ``PULocationID``
        and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, independent of the frame that was read.
    """
    df = pd.read_parquet(filename)

    df['trip_duration_minutes'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame that was read, so the
    # column assignment below does not write into a slice (avoids
    # SettingWithCopyWarning on pandas < 3).
    df = df[df['trip_duration_minutes'].between(1, 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']

    df[categorical] = df[categorical].astype(str)

    return df

In [30]:
# Validation set: the 2023-02 file, prepared by read_dataframe() with the same steps as the training data.
df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [31]:
# One {column: value} dict per validation row, restricted to the categorical features.
val_dicts = df_val.loc[:, categorical].to_dict(orient='records')

In [32]:
# Encode with the vectorizer fitted on the training data (transform only, no refit), so the
# validation matrix has the same columns as X_train. Per the DictVectorizer documentation,
# feature values not seen during fit are silently ignored.
X_val = dv.transform(val_dicts)

In [33]:
# Validation target as a plain NumPy array.
y_val = df_val[target].to_numpy()

In [34]:
# Predict on the validation features; this overwrites the training-set `y_pred`.
y_pred = lr.predict(X_val)

In [35]:
# Validation RMSE. Computed as sqrt(MSE), the same way as the training RMSE above,
# because the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_val, y_pred))

7.811832641626525